Wikipedia:WikiProject Interlanguage Links/Scripts
Appearance
Into a mysql database load the page and langlinks database dump files from two or more wikipedia Wikipedia:Database_download.
Extract interlanguage links into a single table for anaylysis
CREATE TABLE inter (
from_lang varchar(10),
from_title varchar(255),
to_lang varchar(10),
to_title varchar(255)
);
USE en;
INSERT INTO inter.inter
SELECT 'en', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND page.page_namespace = 0;
USE de;
INSERT INTO inter.inter
SELECT 'de', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND page.page_namespace = 0;
... etc
Tidy up this table and index it:
update inter set from_title = replace( from_title, '_', ' ' ); update inter set to_title = replace( to_title, '_', ' ' ); // Clear nonsensical entries -- entries to nonsensical languages -- entries to blank titles -- entries to unlikely titles -- poss entries to nonexistant articles ? ALTER TABLE inter ADD INDEX ( from_lang, from_title ); ALTER TABLE inter ADD INDEX ( to_lang, to_title );
Extract the information required, ie:
// suggested reciporcal links
// a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x
INSERT INTO suggestions
SELECT a.to_lang, a.to_title, a.from_lang, a.from_title
FROM inter a
INNER JOIN en.page p
ON p.page_title = a.to_title
AND p.page_namespace = 0
AND p.page_is_redirect = 0
LEFT JOIN inter b
ON b.from_lang = a.to_lang
AND b.from_title = a.to_title
AND b.to_lang = a.from_lang
WHERE a.from_lang IN ('de','es','fr','it','nl','ja','pl','pt','sv' )
AND a.to_lang = 'en'
AND b.from_lang IS NULL;
// Interlanguage links to pages that do not exist
// en -> fr only
INSERT INTO suggestions
SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
ON f.page_title = a.from_title
AND f.page_namespace = 0
AND f.page_is_redirect = 0
LEFT JOIN fr.page t
ON t.page_title = a.to_title
AND t.page_namespace = 0
WHERE a.from_lang = 'en'
AND a.to_lang = 'fr'
AND t.page_title IS NULL
// Interlanguage links to redirects
// en -> fr only
SELECT a.from_lang, a.from_title, a.to_lang, a.to_title
FROM inter a
INNER JOIN en.page f
ON f.page_title = a.from_title
AND f.page_namespace = 0
AND f.page_is_redirect = 0
INNER JOIN fr.page t
ON a.to_title = t.page_title
AND t.page_namespace = 0
AND t.page_is_redirect = 1
WHERE a.from_lang = 'en'
AND a.to_lang = 'fr';
Finally, extract these suggestions in a human-readable format
DROP PROCEDURE IF EXISTS report_suggestions;
DELIMITER //
CREATE PROCEDURE report_suggestions( group_size INT, flang VARCHAR(10), tlang VARCHAR(10) )
BEGIN
DECLARE sug_pos, sug_base, done INT;
DECLARE ftitle, ttitle VARCHAR(255);
DECLARE sug CURSOR FOR SELECT DISTINCT from_title, to_title FROM suggestions WHERE from_lang = flang AND to_lang = tlang;
DECLARE CONTINUE HANDLER FOR SQLSTATE '23000' SET done = 1;
SET sug_pos = 0;
SET sug_base = 0;
SET done = 0;
OPEN sug;
REPEAT
FETCH sug INTO ftitle, ttitle;
IF NOT done THEN
IF sug_pos = 0 THEN
SELECT concat( '=== ', sug_base, ' - ', sug_base + group_size - 1, ' ===' );
SET sug_base = sug_base + group_size;
SET sug_pos = group_size - 1;
ELSE
SET sug_pos = sug_pos - 1;
END IF;
SELECT concat( '*[[', ftitle, ']] → [[:', tlang, ':', ttitle, ']]' );
END IF;
UNTIL done END REPEAT;
CLOSE sug;
END;
//
DELIMITER ;
call report_suggestions( 10, 'en', 'sv' );