Jump to content

Wikipedia:WikiProject Interlanguage Links/Scripts

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Topbanana (talk | contribs) at 12:53, 19 August 2007 (Created page with 'Into a mysaql database load the ''page'' and ''langlinks'' database dump files from two or more wikipedia Wikipedia:Database_download. Extra...'). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
(diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)

Into a mysaql database load the page and langlinks database dump files from two or more wikipedia Wikipedia:Database_download.

Extract interlanguage links into a single table for anaylysis

CREATE TABLE inter (
	from_lang	varchar(10),
	from_title	varchar(255),
	to_lang	varchar(10),
	to_title	varchar(255)
	);

USE en;

INSERT INTO inter.inter
SELECT 'en', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND   page.page_namespace = 0;

USE de;

INSERT INTO inter.inter
SELECT 'de', page.page_title, langlinks.ll_lang, langlinks.ll_title
FROM page, langlinks
WHERE page.page_id = langlinks.ll_from
AND   page.page_namespace = 0;

... etc

Tidy up this table and index it:




update inter set from_title = replace( from_title, '_', ' ' );
update inter set to_title = replace( to_title, '_', ' ' );

// Clear nonsensical entries
 -- entries to nonsensical languages
 -- entries to blank titles
 -- entries to unlikely titles
 -- poss entries to nonexistant articles ?

ALTER TABLE inter ADD INDEX ( from_lang, from_title );
ALTER TABLE inter ADD INDEX ( to_lang, to_title );


And extract suggested red links, ie:

CREATE TABLE suggestions (
	from_lang	varchar(10),
	from_title	varchar(255),
	to_lang	varchar(10),
	to_title	varchar(255)
	);

INSERT INTO suggestions
SELECT DISTINCT a.from_lang, a.from_title, b.to_lang, b.to_title
FROM inter a 
INNER JOIN inter b
	ON   a.to_lang = b.from_lang
	AND  a.to_title = b.from_title
LEFT JOIN inter c
	ON   c.from_lang = a.from_lang
	AND  c.from_title = a.from_title
	AND  c.to_lang = b.to_lang  // Not c.to_title - any link from a to c existsing is sufficient
WHERE a.from_lang = 'en'
AND   b.to_lang != a.from_lang
AND   c.from_lang IS NULL;

// a:x -> b:x and b:x exists and no link from b:? to a:x exists => b:x -> a:x

SELECT concat( '*[[', a.to_title, ']] → [[:', a.from_lang, ':', a.from_title, ']]'  )
FROM inter a
INNER JOIN en.page p
    ON p.page_title = a.to_title
    AND p.page_namespace = 0
    AND p.page_is_redirect = 0
LEFT JOIN inter b
    ON   b.from_lang = a.to_lang
    AND  b.from_title = a.to_title     
    AND  b.to_lang = a.from_lang
WHERE a.from_lang = 'sv'
AND   a.to_lang = 'en'
AND   b.from_lang IS NULL;