Jump to content

User:Opencooper/showKanji-dev.js

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script shows, if found, the kanji and kana for an article
// It then calls another script, bindKana.js, to clean up the display of ruby
// For configuration, please see the documentation

// TODO: Reject if any base/reading has too low/high of a ratio.
// TODO: Reject if unbalanced parenthesis count
// TODO: <rb> is not actually in the WHATWG standard...

// License: CC0

/* Sample pages:
    https://en.wikipedia.org/wiki/Tamio_Kawachi - kana on wikidata
    https://en.wikipedia.org/wiki/A_Fantastic_Tale_of_Naruto - kanji from wikidata only
    https://en.wikipedia.org/wiki/What_a_Wonderful_World! - kana from wikidata only
    https://en.wikipedia.org/wiki/Asako_I_%26_II - from redirect
    https://en.wikipedia.org/wiki/Bokura_ga_Ita_(film) - interwiki to subsection
    https://ja.wikipedia.org/wiki/%E7%B4%AF - kana part of bolded title
    https://en.wikipedia.org/wiki/Bokutachi_no_Koukan_Nikki - kana not in first sentence
    https://en.wikipedia.org/wiki/Domestic_Girlfriend - first full sentence not lead
    https://ja.wikipedia.org/wiki/SCP%E8%B2%A1%E5%9B%A3 - bolded term w/ kana past first sentence
    https://en.wikipedia.org/wiki/Nuclear_fusion - different term w/ kana in lead
    https://en.wikipedia.org/wiki/Oedipus_Rex - other stuff in kana
    https://en.wikipedia.org/wiki/20th_Century_Boys - overcapturing because title is subset
    https://en.wikipedia.org/wiki/Seiza - kana not at start of parenthesis
    https://en.wikipedia.org/wiki/Indentation_style - other kana in disambiguation
    https://en.wikipedia.org/wiki/Haven%27t_You_Heard%3F_I%27m_Sakamoto - Halfwidth-fullwidth difference
    https://en.wikipedia.org/wiki/Kanji_Furutachi - kanji only
    https://en.wikipedia.org/wiki/Anata_e - hiragana only
    https://en.wikipedia.org/wiki/Anatahan_(film) - katakana only
    https://en.wikipedia.org/wiki/A.LI.CE - latin only
    https://en.wikipedia.org/wiki/0.5_mm - numeric
    https://en.wikipedia.org/wiki/Truth_Coming_Out_of_Her_Well - angle brackets
    https://en.wikipedia.org/wiki/South_of_the_Border,_West_of_the_Sun - kana contains comma
    https://en.wikipedia.org/wiki/Leap_year - multiple kana separated by comma
    https://en.wikipedia.org/wiki/Do_You_Love_Your_Mom_and_Her_Two-Hit_Multi-Target_Attacks%3F - question mark
    https://en.wikipedia.org/wiki/Comic_Magazine - exclamation point
    https://en.wikipedia.org/wiki/Tsurune - dash
    https://en.wikipedia.org/wiki/Flare_(film) - wave dash
    https://en.wikipedia.org/wiki/Dog%C3%97Police - multiplication sign
    https://en.wikipedia.org/wiki/Foreboding_(film) - spaces
    https://en.wikipedia.org/wiki/Age_12 - period in title
    https://en.wikipedia.org/wiki/Suzukake_Nanchara - very long kanji
    https://en.wikipedia.org/wiki/After_the_Rain_(manga) - kanji + hiragana
    https://en.wikipedia.org/wiki/Afro_Tanaka - kanji + katakana
    https://en.wikipedia.org/wiki/Battle_Girl:_The_Living_Dead_in_Tokyo_Bay - katakana + latin
    https://en.wikipedia.org/wiki/Calling_You_(short_story_collection) - kanji + hiragana + latin
    https://en.wikipedia.org/wiki/Ashita_no_Joe - hiragana + katakana
    https://en.wikipedia.org/wiki/Arcadia_of_My_Youth - kanji + hiragana + katakana
    https://en.wikipedia.org/wiki/Haou_Airen - special character
    https://ja.wikipedia.org/wiki/%E6%98%A0%E7%94%BB_%E8%81%B2%E3%81%AE%E5%BD%A2 - reference in between
    https://en.wikipedia.org/wiki/Ninjō - No interlanguage, but wiktionary
    https://en.wikipedia.org/wiki/Seiza - Interlanguage failed, but wiktionary
    https://en.wikipedia.org/wiki/Epsomite - No interlanguage, but wiktionary "see" Table
    https://en.wikipedia.org/wiki/Bakayaro!_I%27m_Plenty_Mad - only part of parenthesis extracted

    https://en.wikipedia.org/wiki/ORCID
    https://en.wikipedia.org/wiki/Survive_Style_5%2B - fails due to +
    https://en.wikipedia.org/wiki/Ko-Shint%C5%8D
    https://ja.wikipedia.org/wiki/Terminate_and_Stay_Resident
    https://en.wikipedia.org/wiki/Ikk%C5%8D-sh%C5%AB
    https://en.wikipedia.org/wiki/Kakegoe - doesn't find jawiki interlanguage
    https://en.wikipedia.org/wiki/Love_Live!_The_School_Idol_Movie - interpunct in reading
    https://en.wikipedia.org/wiki/Lupin_the_Third:_The_Woman_Called_Fujiko_Mine - hyphen in kanji
    https://en.wikipedia.org/wiki/Sunscreen
    https://en.wikipedia.org/wiki/Flag_of_China
    https://en.wikipedia.org/wiki/W3m
    https://en.wikipedia.org/wiki/Magnum_Collection_1999_%22Dear%22
    https://en.wikipedia.org/wiki/EC_Comics
    https://en.wikipedia.org/wiki/CJK_characters
    https://en.wikipedia.org/wiki/My_Girlfriend_is_Shobitch
    https://en.wikipedia.org/wiki/Immaculate_Conception_Cathedral,_Nagasaki - partial match
    https://en.wikipedia.org/wiki/USA-224 - または
    https://en.wikipedia.org/wiki/Milk - bad match
    https://en.wikipedia.org/wiki/Not_invented_here
*/

// Entry point of the script: checks that the current page is eligible,
// reserves a placeholder inside the page heading, and requests the item's
// Japanese label from Wikidata (the reply is handled by parseJaLabel).
function setup() {
    const config = mw.config;

    // Only continue when viewing an article without an explicit oldid in the
    // URL, that is not the main page and not on a Japanese-language wiki
    if (config.get( 'wgAction' ) !== 'view') { return; }
    if (!config.get( 'wgIsArticle' )) { return; }
    if (location.search.split('oldid=')[1]) { return; }
    if (config.get("wgIsMainPage")) { return; }
    if (config.get("wgContentLanguage") === "ja") { return; }

    // No wikidata item is taken to mean no 1:1 interlanguage link either, which
    // also avoids pages that merely link to a subsection of a jawiki article
    if (wikidataId === null) { return; }

    // Locate the heading; the selector depends on the skin
    const headingSelectors = [
        '#firstHeading', // Vector
        '.page-heading'  // Minerva
    ];
    let header = null;
    for (const selector of headingSelectors) {
        const candidate = $(selector);
        if (candidate.length) {
            header = candidate;
            break;
        }
    }
    if (header === null) {
        console.error("showKanji-dev.js: Couldn't find a page heading. This skin ("
                      + mw.config.get( 'skin' ) + ") might not be supported.");
        return;
    }

    // Insert the container right away so later-added elements don't push it down
    header.append("<div id='kanjiInfo' lang='ja' dir='ltr'></div>");

    // Ask wikidata for the Japanese label
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    const labelQuery = {
        action: "wbgetentities",
        ids: wikidataId,
        props: "labels",
        languages: "ja",
        format: "json",
        origin: "*"
    };
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: labelQuery,
        success: parseJaLabel
    });
}

// Success handler for the wbgetentities request made by setup().
// Extracts the item's Japanese label, displays it, and, unless the label
// consists only of kana, starts the search for its reading.
//   response: parsed JSON body returned by the Wikidata API
function parseJaLabel(response) {
    // The response may be an API error, the entity may be absent, or it may
    // carry no labels at all, so walk down step by step instead of throwing
    const entities = response && response.entities;
    const wikidataInfo = entities && entities[wikidataId];
    const labels = wikidataInfo && wikidataInfo.labels;
    const jaEntry = labels && labels.ja;
    let jaLabel = jaEntry && jaEntry.value;

    if (!jaLabel) {
        return;
    }

    jaLabel = jaLabel.toHalfWidth();
    console.log("showKanji-dev.js: kanji: `" + jaLabel + "`");
    // The regexes must exist before displayKanji and the kana-only check run
    buildRegexes(jaLabel);
    displayKanji(jaLabel);

    // If the japanese title is not just only kana, get the reading
    if (!kanjiRegexes.kanaOnly.test(jaLabel)) {
        requestKana();
    }
}

// Fills the shared kanjiRegexes object with the patterns used to classify the
// title (latinOnly, kanaOnly, hiraganaOnly, katakanaOnly) and to find its
// reading in the jawiki lead (lead, leadUnspaced).
//   kanji: the Japanese title the patterns are built for
function buildRegexes(kanji) {
    // Strip $kanji of all kanji and kana, adding whatever is left to the regex
    var reKanjiKana = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴー-]/g;
    var kanjiStripped = kanji.replace(reKanjiKana, "");
    // The leftovers are spliced into character classes below, so neutralise the
    // characters that are special there; otherwise a "]" in the title would
    // close the class early and a "\" would start an escape sequence
    kanjiStripped = kanjiStripped.replace(/[\\\]\[^]/g, "\\$&");
    kanjiStripped += " ";
    // Need to add hyphen escaped since it has special behavior in regex classes
    // (hyphens in the title itself were removed by reKanjiKana above)
    kanjiStripped += "\\-";
    // Same leftovers minus word characters; \w never matches a backslash, so
    // the escapes added above survive
    var kanjiAuxillary = kanjiStripped.replace(/\w/g, "");

    kanjiRegexes.latinOnly = /^[A-Za-z0-9\-.?!/,:;@#$%&+=*'"・ ]+$/;
    kanjiRegexes.kanaOnly = new RegExp("^[ぁ-ゔァ-ヴー" + kanjiAuxillary + "]+$");
    kanjiRegexes.hiraganaOnly = new RegExp("^[ぁ-ゔーA-Za-z" + kanjiAuxillary + "]+$");
    kanjiRegexes.katakanaOnly = new RegExp("^[ァ-ヴーA-Za-z" + kanjiAuxillary + "]+$");

    // Add midpoint for Latin in titles
    if (/\w/.test(kanji)) { kanjiStripped += "・"; }
    console.log("showKanji-dev.js: stripped: `" + kanjiStripped + "`");

    // Capture group for the reading: kana plus the title's own extra characters
    var leadReBase = "([ぁ-ゔァ-ヴー" + kanjiStripped + "]+)";
    var kanjiEscaped = mw.util.escapeRegExp(kanji);
    // Account for spaces, but ignore backslash and other misc characters:
    // every kanji/kana/Latin character may be followed by an optional space
    var reKanjiKanaLatin = /([\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴーA-Za-z0-9])/g;
    var kanjiSpaced = kanjiEscaped.replace(/ /g, " ?");
    kanjiSpaced = kanjiSpaced.replace(reKanjiKanaLatin, "$1 ?");

    // Add kanji to regex to make sure we're not getting the reading of some
    // other term: title, anything up to the first "(", then the reading
    kanjiRegexes.leadUnspaced = new RegExp(kanjiEscaped + "[^(\n)]*?\\(" + leadReBase);
    kanjiRegexes.lead = new RegExp(kanjiSpaced + "[^(\n)]*?\\(" + leadReBase, "i"); // brittle
}

// Shows the Japanese title inside #kanjiInfo and tags the container according
// to the script the title is written in. Also remembers the title in the
// shared wikidataKanji variable for the later lookup steps.
//   kanji: the Japanese title to show
function displayKanji(kanji) {
    wikidataKanji = kanji;

    var info = $("#kanjiInfo");
    // The label is user-editable data from Wikidata: insert it as text so any
    // markup it contains is shown literally instead of being parsed as HTML
    info.append($("<ruby>").text(kanji));

    // Add some classes so users can choose to not display for example
    // katakana-only kanji in their CSS
    if (kanjiRegexes.latinOnly.test(kanji)) {
        info.addClass("kanjiInfo-latin-only");
        info.prop("title", "Japanese title in Latin script");
        // Latin-only titles are hidden
        info.css("display", "none");
    } else if (kanjiRegexes.hiraganaOnly.test(kanji)) {
        info.addClass("kanjiInfo-hiragana-only");
        info.prop("title", "Japanese title in hiragana");
    } else if (kanjiRegexes.katakanaOnly.test(kanji)) {
        info.addClass("kanjiInfo-katakana-only");
        info.prop("title", "Japanese title in katakana");
    } else {
        info.prop("title", "Japanese title in kanji");
    }
}

// Requests the claims of the current wikidata item; the reply goes to
// parseKanaClaim.
function requestKana() {
    // Every claim is fetched rather than a single property, since the kana may
    // be attached as a qualifier on some other claim
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetclaims
    const claimsQuery = {
        action: "wbgetclaims",
        entity: wikidataId,
        format: "json",
        origin: "*"
    };
    const settings = {
        url: "https://www.wikidata.org/w/api.php",
        data: claimsQuery,
        success: parseKanaClaim
    };
    $.ajax(settings);
}

// Success handler for the wbgetclaims request made by requestKana().
// Looks for a "name in kana" (P1814) value, first as a qualifier on one of the
// name-like properties whose text equals the displayed title, then as a claim
// of its own. Displays the reading when found; otherwise falls back to
// getInterlanguage().
//   response: parsed JSON body returned by the Wikidata API
function parseKanaClaim(response) {
    const properties = {
        title: "P1476",
        nativeLabel: "P1705",
        officialName: "P1448",
        nameInNativeLanguage: "P1559"
    };
    const nameInKana = "P1814";

    // An error response has no claims at all
    const claims = (response && response.claims) || {};
    const wantedKanji = wikidataKanji.replace(/ /g, "");

    // A snak without a concrete value has no datavalue; yield undefined for
    // those instead of throwing
    const snakValue = function(snak) {
        return snak && snak.datavalue ? snak.datavalue.value : undefined;
    };

    let kana;
    let prop;

    // Try getting nameInKana as a qualifier to some properties. Every statement
    // of a property is checked, since the matching one need not be the first.
    for (const name of Object.keys(properties)) {
        const statements = claims[properties[name]] || [];
        for (const statement of statements) {
            const value = snakValue(statement.mainsnak);
            const readings = statement.qualifiers && statement.qualifiers[nameInKana];
            if (!value || typeof value.text !== "string" || !readings) {
                continue;
            }
            // Compare ignoring spaces
            if (value.text.replace(/ /g, "") !== wantedKanji) {
                continue;
            }
            const reading = snakValue(readings[0]);
            if (typeof reading === "string" && reading) {
                kana = reading;
                prop = name;
                break;
            }
        }
        if (kana) {
            break;
        }
    }

    // Try getting nameInKana as a general claim
    if (!kana) {
        const general = claims[nameInKana] || [];
        for (const statement of general) {
            const reading = snakValue(statement.mainsnak);
            if (typeof reading === "string" && reading) {
                kana = reading;
                prop = "nameInKana";
                break;
            }
        }
    }

    // We couldn't find nameInKana
    if (!kana) {
        getInterlanguage();
        return;
    }

    kana = kana.toHalfWidth();
    displayKana(kana);
    // Record where the reading came from
    $("#kanjiInfo").addClass("kanjiInfo-wikidata");
    $("#kanjiInfo").addClass("kanjiInfo-wikidata-" + prop);
}

// Asks the local wiki for the current page's Japanese interlanguage link. When
// one exists its target (minus any anchor) is passed to scrapeKana();
// otherwise the lookup falls back to getWiktionary().
function getInterlanguage() {
    var apiUrl = location.origin + "/w/api.php";
    var pageId = mw.config.get( 'wgArticleId' );
    // Documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
    $.ajax({
        url: apiUrl,
        data: {
            action: "query",
            format: "json",
            prop: "langlinks",
            lllang: "ja",
            // Query by page id rather than by 'wgTitle': that title lacks the
            // namespace prefix, and the reply is looked up by page id anyway
            pageids: pageId
        },
        success: function(response) {
            // An error response carries no "query" member
            var pages = response && response.query && response.query.pages;
            var page = pages ? pages[pageId] : undefined;
            var langlinks = page ? page.langlinks : undefined;
            if (!langlinks || !langlinks.length) {
                getWiktionary();
                return;
            }

            var jaLabel = langlinks[0]["*"];
            jaLabel = jaLabel.replace(/(.*)#.*/, "$1"); // rm anchors
            scrapeKana(jaLabel);
        }
    });
}

// Requests the plain-text start of the lead of a jawiki article; the reply is
// handled by getFirstSentence.
//   jaLabel: title of the Japanese article
function scrapeKana(jaLabel) {
    // API docs: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bextracts
    const extractQuery = {
        action: "query",
        prop: "extracts",
        format: "json",
        redirects: true,
        // intro section only, two sentences, as plain text
        exintro: true,
        exsentences: 2,
        exlimit: 1,
        explaintext: true,
        titles: jaLabel,
        origin: "*"
    };
    const settings = {
        url: "https://ja.wikipedia.org/w/api.php",
        data: extractQuery,
        success: getFirstSentence
    };
    $.ajax(settings);
}

// Success handler for the extract request made by scrapeKana().
// Searches the start of the jawiki lead for the title followed by a
// parenthesised reading and displays that reading. Falls back to
// getWiktionary() whenever no usable reading is found.
//   response: parsed JSON body returned by the jawiki API
function getFirstSentence(response) {
    // An error response has no "query" member; treat it like a missing lead so
    // the Wiktionary fallback still runs
    var responsePart = response && response.query && response.query.pages;
    // Have to split parsing into two parts since jawiki pageid is unknown
    var pageId = responsePart ? Object.keys(responsePart)[0] : undefined;
    var page = pageId === undefined ? undefined : responsePart[pageId];
    var introText = page ? page.extract : undefined;

    if (!introText) {
        console.error("showKanji-dev.js: TextExtracts failed to get a lead for the Japanese article.");
        getWiktionary();
        return;
    }

    var wikitext = introText.toHalfWidth();

    console.log("showKanji-dev.js: lead: `" + wikitext + "`");
    console.log("showKanji-dev.js: regex: `" + kanjiRegexes.lead + "`");
    console.log("showKanji-dev.js: regex (unspaced): `" + kanjiRegexes.leadUnspaced + "`");

    // A hit is the full match plus the captured reading
    var kanaSearch = wikitext.match(kanjiRegexes.lead);
    if (!kanaSearch || kanaSearch.length != 2) {
        getWiktionary();
        return;
    }

    // Rm trailing characters
    var kana = kanaSearch[1].replace(/[・、 ]$/, "");

    // Abort if our reading is only katakana (for non-Latin) or Latin
    var titleIsLatin = kanjiRegexes.latinOnly.test(wikidataKanji);
    if ((!titleIsLatin && kanjiRegexes.katakanaOnly.test(kana))
        || kanjiRegexes.latinOnly.test(kana)) {
        console.log("showKanji-dev.js: throwing away reading: " + kana);
        getWiktionary();
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-jawiki");
}

// Adapted from:
//     http://ilog4.blogspot.com/2015/09/javascript-convert-full-width-and-half.html
//     https://stackoverflow.com/a/20488304/1995949
//     https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms
String.prototype.toHalfWidth = function() {
    // Returns a copy of the string with fullwidth forms replaced by their
    // ordinary counterparts. The fullwidth variants U+FF01..U+FF5E sit exactly
    // 0xFEE0 above the ASCII characters "!".."~".
    var halfWidth = this.replace(/[\uff01-\uff5e]/g, function(s) {
        return String.fromCharCode(s.charCodeAt(0) - 0xFEE0);
    });
    // The ideographic space is outside that range and is handled separately.
    // It is written as an escape because a literal one is indistinguishable
    // from an ordinary space in the source.
    halfWidth = halfWidth.replace(/\u3000/g, " ");
    return halfWidth;
};

// We use the English Wiktionary because it has more terms and better structure
function getWiktionary() {
    // Requests the section list of the Wiktionary entry named after the
    // displayed title; findJapaneseSection handles the reply.
    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const sectionsQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "sections",
        origin: "*"
    };
    const settings = {
        url: "https://en.wiktionary.org/w/api.php",
        data: sectionsQuery,
        success: findJapaneseSection
    };
    $.ajax(settings);
}

// Success handler for the section-list request made by getWiktionary().
// Finds the section titled "Japanese" and requests its rendered text, which is
// then handled by parseWiktionary. Stops when the entry or section is missing.
//   response: parsed JSON body returned by the Wiktionary API
function findJapaneseSection(response) {
    if (response.error) {
        console.log("showKanji-dev.js: No Wiktionary item for " + wikidataKanji);
        return;
    }

    // First section whose heading reads "Japanese", if any
    const japanese = response.parse.sections.find(function(section) {
        return section.line == "Japanese";
    });
    const sectionIndex = japanese ? japanese.index : undefined;

    if (sectionIndex == null) {
        console.log("showKanji-dev.js: Wiktionary entry doesn't have a section titled 'Japanese'");
        return;
    }

    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const textQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "text",
        section: sectionIndex,
        origin: "*"
    };
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: textQuery,
        success: parseWiktionary
    });
}

// Success handler for the section-text request made by findJapaneseSection().
// Extracts a reading from the rendered Japanese section, either from the
// headword line or, failing that, from the first ruby element inside a .Jpan
// element, and displays it.
//   response: parsed JSON body returned by the Wiktionary API
function parseWiktionary(response) {
    // An error response has no "parse" member
    if (!response || response.error || !response.parse || !response.parse.text) {
        console.log("showKanji-dev.js: Couldn't get the Japanese section from Wiktionary for " + wikidataKanji);
        return;
    }

    var html = response.parse.text["*"];
    var parsed = $($.parseHTML(html));

    // Wiktionary adds readings as furigana
    var headword = parsed.find(".headword:lang(ja)").first();
    var seeTable = parsed.find(".Jpan ruby").first();

    var kanji = "";
    var kana = "";
    if (headword.length) {
        // Wiktionary already binds their kana, so we have to undo the process to get
        // the constituent parts, at least with the current markup
        var childNodes = headword[0].childNodes;
        for (let i = 0; i < childNodes.length; i++) {
            var node = childNodes[i];
            if (node.nodeName == "RUBY") {
                var ruby = $(node); // convert back to JQuery for convenience
                // Order matters: drop <rp>, pull out <rt> as the reading, and
                // what text remains is the base
                ruby.children("rp").remove();
                kana += ruby.children("rt").detach().text();
                kanji += ruby.text();
            } else if (node.nodeType == 3) { // "#text"
                // Plain text belongs to both the base and the reading
                kanji += node.nodeValue;
                kana += node.nodeValue;
            }
        }

        // Only accept a headword that spells exactly our title
        if (kanji != wikidataKanji) { return; }
    } else if (seeTable.length) {
        // NOTE(review): the base text of this fallback is not compared against
        // the title, so it can pick up the reading of a different term
        kana = seeTable.children("rt").text();
    } else {
        return;
    }

    if (!kana) {
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-wiktionary");

    // Extra stuff just for fun: use the first definition as the tooltip
    var definition = headword.parent().siblings("ol").children("li").first().text();
    definition = definition.split('\n', 1)[0];
    definition = definition.replace(/\[[0-9]{1,2}\]/g, ""); // rm reference markers
    // Keep the existing tooltip when no definition was found (always the case
    // in the fallback branch, where headword is empty)
    if (definition) {
        $("#kanjiInfo").prop("title", definition);
    }
}

// Attaches the reading to the displayed title as ruby text and, unless the
// title consists solely of kanji, loads the companion script that tidies up
// redundant furigana.
//   kana: the reading to show
function displayKana(kana) {
    // The reading comes from remote sources (Wikidata, jawiki, Wiktionary):
    // insert it as text so any markup in it is not parsed as HTML
    $("#kanjiInfo ruby").append($("<rt>").text(kana));

    // Cleanup redundant furigana with another script
    var kanjiOnlyRe = /^[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]+$/;
    if (!kanjiOnlyRe.test(wikidataKanji)) {
       mw.loader.load( '//en.wikipedia.org/w/index.php?title=User:Opencooper/bindKana-dev.js&action=raw&ctype=text/javascript' );
    }
}

// State shared between the functions above.
// Value of the 'wgWikibaseItemId' config key; setup() does nothing when it is null
var wikidataId = mw.config.get( 'wgWikibaseItemId' );
// The Japanese title being displayed; assigned by displayKanji()
var wikidataKanji;
// Regexes for the current title; populated by buildRegexes()
var kanjiRegexes = {};
// Pass setup to jQuery, which runs it once the DOM is ready
$(setup);