Jump to content

User:Opencooper/showKanji.js

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script shows, if found, the kanji and kana for an article
// It then calls another script, bindKana.js, to clean up the display of ruby
// For configuration, please see the documentation

// License: CC0

// Entry point. Does nothing unless an article is being viewed (no "oldid=" in
// the URL, not the main page, wiki content language other than Japanese) and
// the page has a Wikidata item. Otherwise it reserves a #kanjiInfo container
// inside the page heading and requests the item's Japanese label from
// Wikidata; the response is handed to parseJaLabel().
function setup() {
    // If we're not reading an article, do nothing
    if (mw.config.get('wgAction') !== 'view') { return; }
    if (!mw.config.get('wgIsArticle')) { return; }
    if (location.search.split('oldid=')[1]) { return; }
    if (mw.config.get('wgIsMainPage')) { return; }
    if (mw.config.get('wgContentLanguage') === 'ja') { return; }

    // No Wikidata item is taken to mean no 1:1 interlanguage link either, and
    // we don't want pages that merely link to a subsection of a jawiki article
    if (wikidataId === null) { return; }

    // Locate the page heading; the selector depends on the skin
    const headingSelectors = [
        '#firstHeading', // Vector
        '.page-heading'  // Minerva
    ];
    let header;
    for (const selector of headingSelectors) {
        const candidate = $(selector);
        if (candidate.length) {
            header = candidate;
            break;
        }
    }
    if (!header) {
        console.error(`showKanji.js: Couldn't find a page heading. This skin (${mw.config.get('skin')}) might not be supported.`);
        return;
    }

    // Placeholder so other elements don't push it down later
    header.append("<div id='kanjiInfo' lang='ja' dir='ltr'></div>");

    // Get the Japanese label from wikidata
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    const labelQuery = {
        action: "wbgetentities",
        ids: wikidataId,
        props: "labels",
        languages: "ja",
        format: "json",
        origin: "*"
    };
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: labelQuery,
        success: parseJaLabel
    });
}

// Success handler for the wbgetentities request made by setup().
// Pulls the Japanese label out of the response, normalises it to half-width,
// builds the regexes for it, displays it, and requests a reading unless the
// label consists of kana only.
//
// response: parsed JSON from the Wikidata API. Responses without the entity
// or without a Japanese label (API error, missing entity, unlabeled item)
// are ignored instead of throwing.
function parseJaLabel(response) {
    var entities = response && response.entities;
    var wikidataInfo = entities && entities[wikidataId];
    var labels = wikidataInfo && wikidataInfo.labels;
    var jaLabel = labels && labels.ja && labels.ja.value;

    if (!jaLabel) {
        return;
    }

    jaLabel = jaLabel.toHalfWidth();
    buildRegexes(jaLabel);
    displayKanji(jaLabel);

    // If the japanese title is not just only kana, get the reading
    if (!kanjiRegexes.kanaOnly.test(jaLabel)) {
        requestKana();
    }
}

// Populates the shared kanjiRegexes object for the given Japanese label:
//   latinOnly / kanaOnly / hiraganaOnly / katakanaOnly - whole-string script
//       tests, which also accept the label's own punctuation
//   lead - finds the label followed by an opening parenthesis and captures
//       the kana run right after it
//
// kanji: the (half-width normalised) Japanese label
function buildRegexes(kanji) {
    // Strip $kanji of all kanji and kana, adding whatever is left to the regex
    var reKanjiKana = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴー-]/g;
    var kanjiStripped = kanji.replace(reKanjiKana, "");
    // The leftovers are spliced into [...] classes below, so escape the
    // characters that would otherwise close the class (]), start an escape
    // sequence (\) or negate the class (^)
    kanjiStripped = kanjiStripped.replace(/[\\\]^]/g, "\\$&");
    kanjiStripped += " ";
    // Need to add hyphen escaped since it has special behavior in regex classes
    kanjiStripped += "\\-";
    // Same leftovers minus Latin letters, digits and underscore
    var kanjiAuxillary = kanjiStripped.replace(/\w/g, "");

    kanjiRegexes.latinOnly = /^[A-Za-z0-9\-.?!/,:;@#$%&+=*'"・ ]+$/;
    kanjiRegexes.kanaOnly = new RegExp("^[ぁ-ゔァ-ヴー" + kanjiAuxillary + "]+$");
    kanjiRegexes.hiraganaOnly = new RegExp("^[ぁ-ゔーA-Za-z" + kanjiAuxillary + "]+$");
    kanjiRegexes.katakanaOnly = new RegExp("^[ァ-ヴーA-Za-z" + kanjiAuxillary + "]+$");

    // Add midpoint for Latin in titles
    if (/\w/.test(kanji)) { kanjiStripped += "・"; }

    var leadReBase = "([ぁ-ゔァ-ヴー" + kanjiStripped + "]+)";
    var kanjiEscaped = mw.util.escapeRegExp(kanji);
    // Account for spaces, but ignore backslash and other misc characters
    var reKanjiKanaLatin = /([\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴーA-Za-z0-9])/g;
    var kanjiSpaced = kanjiEscaped.replace(/ /g, " ?");
    kanjiSpaced = kanjiSpaced.replace(reKanjiKanaLatin, "$1 ?");

    // Add kanji to regex to make sure we're not getting the reading of some
    // other term
    kanjiRegexes.lead = new RegExp(kanjiSpaced + "[^(\n)]*?\\(" + leadReBase, "i"); // brittle
}

// Shows the Japanese label inside #kanjiInfo as a <ruby> element, remembers it
// in wikidataKanji, and tags the container with a class and tooltip that
// describe the label's script.
//
// kanji: the Japanese label. It comes from Wikidata, which anyone can edit,
// so it is inserted as text rather than concatenated into an HTML string.
function displayKanji(kanji) {
    wikidataKanji = kanji;

    var info = $("#kanjiInfo");
    info.append($("<ruby>").text(kanji));

    // Add some classes so users can choose to not display for example
    // katakana-only kanji in their CSS
    if (kanjiRegexes.latinOnly.test(kanji)) {
        info.addClass("kanjiInfo-latin-only");
        info.prop("title", "Japanese title in Latin script");
        // Latin-only labels are hidden outright
        info.css("display", "none");
    } else if (kanjiRegexes.hiraganaOnly.test(kanji)) {
        info.addClass("kanjiInfo-hiragana-only");
        info.prop("title", "Japanese title in hiragana");
    } else if (kanjiRegexes.katakanaOnly.test(kanji)) {
        info.addClass("kanjiInfo-katakana-only");
        info.prop("title", "Japanese title in katakana");
    } else {
        info.prop("title", "Japanese title in kanji");
    }
}

// Requests every claim of the page's Wikidata item; the response goes to
// parseKanaClaim().
function requestKana() {
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetclaims
    // All claims are fetched rather than a single property because the kana
    // might be present as a qualifier to another claim
    const claimsQuery = {
        action: "wbgetclaims",
        entity: wikidataId,
        format: "json",
        origin: "*"
    };

    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: claimsQuery,
        success: parseKanaClaim
    });
}

// Success handler for the wbgetclaims request made by requestKana().
// Looks for a "name in kana" (P1814) reading, first as a qualifier on a
// name-like statement whose text equals the displayed label (spaces ignored),
// then as a statement of its own. If a reading is found it is displayed and
// #kanjiInfo is tagged with its origin; otherwise getInterlanguage() takes
// over.
//
// response: parsed JSON from the Wikidata API. Statements without a value
// ("novalue"/"somevalue" snaks carry no datavalue) and responses without
// claims are skipped instead of throwing, and every statement of a property
// is examined, not only the first one.
function parseKanaClaim(response) {
    var properties = {
        title: "P1476",
        nativeLabel: "P1705",
        officialName: "P1448",
        nameInNativeLanguage: "P1559"
    };
    var nameInKana = "P1814";
    var claims = (response && response.claims) || {};
    var wantedKanji = wikidataKanji.replace(/ /g, "");

    // Value of a snak, or undefined when the snak holds none
    function snakValue(snak) {
        return snak && snak.datavalue ? snak.datavalue.value : undefined;
    }

    // Try getting nameInKana as a qualifier to some properties
    function findQualifierReading() {
        for (var name in properties) {
            var statements = claims[properties[name]] || [];
            for (var i = 0; i < statements.length; i++) {
                var label = snakValue(statements[i].mainsnak);
                if (!label || typeof label.text !== "string"
                    || label.text.replace(/ /g, "") !== wantedKanji) {
                    continue;
                }

                var qualifiers = statements[i].qualifiers;
                var readings = (qualifiers && qualifiers[nameInKana]) || [];
                for (var j = 0; j < readings.length; j++) {
                    var reading = snakValue(readings[j]);
                    if (reading) {
                        return { kana: reading, prop: name };
                    }
                }
            }
        }
        return null;
    }

    var found = findQualifierReading();

    // Try getting nameInKana as a general claim
    if (!found) {
        var general = claims[nameInKana] || [];
        for (var k = 0; k < general.length; k++) {
            var value = snakValue(general[k].mainsnak);
            if (value) {
                found = { kana: value, prop: "nameInKana" };
                break;
            }
        }
    }

    // We couldn't find nameInKana
    if (!found) {
        getInterlanguage();
        return;
    }

    displayKana(found.kana.toHalfWidth());
    $("#kanjiInfo").addClass("kanjiInfo-wikidata");
    $("#kanjiInfo").addClass("kanjiInfo-wikidata-" + found.prop);
}

// Asks the local wiki for the current page's Japanese interlanguage link.
// When one exists, its title (anchor removed) is passed to scrapeKana();
// otherwise getWiktionary() is tried.
function getInterlanguage() {
    var apiUrl = location.origin + "/w/api.php";
    // Documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
    $.ajax({
        url: apiUrl,
        data: {
            action: "query",
            format: "json",
            prop: "langlinks",
            lllang: "ja",
            // wgPageName keeps the namespace prefix; wgTitle drops it, which
            // would query a different page than the wgArticleId looked up below
            titles: mw.config.get( 'wgPageName' )
        },
        success: function(response) {
            var pageId = mw.config.get( 'wgArticleId' );
            // An error response has no "query" member
            var pages = response && response.query && response.query.pages;
            var page = pages ? pages[pageId] : undefined;
            var langlinks = page ? page.langlinks : undefined;

            if (!langlinks || !langlinks.length) {
                getWiktionary();
                return;
            }

            var jaLabel = langlinks[0]["*"];
            jaLabel = jaLabel.replace(/(.*)#.*/, "$1"); // rm anchors
            scrapeKana(jaLabel);
        }
    });
}

// Requests the plain-text start of the lead (two sentences) of the Japanese
// Wikipedia article titled jaLabel, following redirects; the response goes to
// getFirstSentence().
//
// jaLabel: title of the jawiki article
function scrapeKana(jaLabel) {
    // API docs: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bextracts
    const extractQuery = {
        action: "query",
        prop: "extracts",
        format: "json",
        redirects: true,
        exintro: true,
        exsentences: 2,
        exlimit: 1,
        explaintext: true,
        titles: jaLabel,
        origin: "*"
    };

    $.ajax({
        url: "https://ja.wikipedia.org/w/api.php",
        data: extractQuery,
        success: getFirstSentence
    });
}

// Success handler for the extracts request made by scrapeKana().
// Searches the jawiki lead for the label followed by a parenthesised reading
// (kanjiRegexes.lead) and displays that reading. Falls back to
// getWiktionary() when there is no lead text, no match, or the match looks
// like a transliteration rather than a reading.
//
// response: parsed JSON from the jawiki API. An error response or an empty
// page map is treated like a missing lead instead of throwing.
function getFirstSentence(response) {
    var pages = response && response.query && response.query.pages;
    // Have to split parsing into two parts since jawiki pageid is unknown
    var pageId = pages ? Object.keys(pages)[0] : undefined;
    var introText = pageId !== undefined ? pages[pageId].extract : undefined;

    if (!introText) {
        console.error("showKanji.js: TextExtracts failed to get a lead for the Japanese article.");
        getWiktionary();
        return;
    }

    // The regexes were built from a half-width label, so normalise the lead too
    var leadText = introText.toHalfWidth();

    var kanaSearch = leadText.match(kanjiRegexes.lead);
    if (!kanaSearch || kanaSearch.length !== 2) {
        getWiktionary();
        return;
    }

    // Rm trailing characters
    var kana = kanaSearch[1].replace(/[・、 ]$/, "");

    // Abort if our reading is only katakana (for non-Latin) or Latin
    if ((!kanjiRegexes.latinOnly.test(wikidataKanji) && kanjiRegexes.katakanaOnly.test(kana))
        || kanjiRegexes.latinOnly.test(kana)) {
        getWiktionary();
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-jawiki");
}

// Adapted from:
//     http://ilog4.blogspot.com/2015/09/javascript-convert-full-width-and-half.html
//     https://stackoverflow.com/a/20488304/1995949
//     https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms
// Returns a copy of the string with full-width forms replaced by their
// half-width counterparts: U+FF01..U+FF5E are shifted down by 0xFEE0 onto
// "!".."~", and the ideographic space U+3000 becomes a plain space.
String.prototype.toHalfWidth = function() {
    var halfWidth = this.replace(/[\uff01-\uff5e]/g, function(ch) {
        return String.fromCharCode(ch.charCodeAt(0) - 0xFEE0);
    });
    // Written as an escape so the ideographic space cannot be mistaken for, or
    // silently normalised into, an ordinary space (which would make this a no-op)
    return halfWidth.replace(/\u3000/g, " ");
};

// We use the English Wiktionary because it has more terms and better structure
// Requests the section list of the English Wiktionary entry named after the
// displayed label; the response goes to findJapaneseSection().
function getWiktionary() {
    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const sectionsQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "sections",
        origin: "*"
    };

    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: sectionsQuery,
        success: findJapaneseSection
    });
}

// Success handler for the section-list request made by getWiktionary().
// Finds the first section titled "Japanese" and requests that section's
// rendered text; the response goes to parseWiktionary(). Does nothing on an
// API error or when the entry has no such section.
//
// response: parsed JSON from the Wiktionary API
function findJapaneseSection(response) {
    if (response.error) {
        return;
    }

    const japanese = response.parse.sections.find((section) => section.line == "Japanese");
    const sectionIndex = japanese ? japanese.index : undefined;
    if (sectionIndex == null) {
        return;
    }

    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const textQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "text",
        section: sectionIndex,
        origin: "*"
    };
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: textQuery,
        success: parseWiktionary
    });
}

// Success handler for the section-text request made by findJapaneseSection().
// Reads the reading from the furigana in the rendered Japanese section:
// preferably from the headword, otherwise from the first <ruby> inside a
// .Jpan element. A headword reading is only used when the headword's base
// text equals the displayed label.
//
// response: parsed JSON from the Wiktionary API. Error responses are ignored,
// as findJapaneseSection() already does for its own request.
function parseWiktionary(response) {
    if (!response || response.error || !response.parse || !response.parse.text) {
        return;
    }

    var html = response.parse.text["*"];
    var parsed = $($.parseHTML(html));

    // Wiktionary adds readings as furigana
    var headword = parsed.find(".headword:lang(ja)").first();
    var seeTable = parsed.find(".Jpan ruby").first();

    var kanji = "";
    var kana = "";
    if (headword.length) {
        // Wiktionary already binds their kana, so we have to undo the process to get
        // the constituent parts, at least with the current markup
        var childNodes = headword[0].childNodes;
        for (let i = 0; i < childNodes.length; i++) {
            if (childNodes[i].nodeName == "RUBY") {
                var ruby = $(childNodes[i]); // convert back to JQuery for convenience
                // Order matters: drop <rp>, pull the <rt> reading out, and only
                // then read what is left as the base text
                ruby.children("rp").remove();
                kana += ruby.children("rt").detach().text();
                kanji += ruby.text();
            } else if (childNodes[i].nodeType == 3) { // "#text"
                // Plain text between rubies belongs to both base text and reading
                kanji += childNodes[i].nodeValue;
                kana += childNodes[i].nodeValue;
            }
        }

        if (kanji != wikidataKanji) { return; }
    } else if (seeTable.length) {
        kanji = seeTable.children("rb").text();
        kana = seeTable.children("rt").text();
    } else {
        return;
    }

    if (kana) {
        displayKana(kana);
        $("#kanjiInfo").addClass("kanjiInfo-wiktionary");
    }
}

// Adds the reading as an <rt> annotation to the <ruby> element inside
// #kanjiInfo, then loads bindKana.js unless the displayed label consists of
// kanji only.
//
// kana: the reading. It originates from third-party wiki content, so it is
// inserted as text rather than concatenated into an HTML string.
function displayKana(kana) {
    $("#kanjiInfo ruby").append($("<rt>").text(kana));

    // Cleanup redundant furigana with another script
    var kanjiOnlyRe = /^[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]+$/;
    if (!kanjiOnlyRe.test(wikidataKanji)) {
       mw.loader.load( '//en.wikipedia.org/w/index.php?title=User:Opencooper/bindKana.js&action=raw&ctype=text/javascript' );
    }
}

// State shared by the functions above.
// ID of the Wikidata item linked to the current page, read from the
// MediaWiki config; setup() does nothing when this is null
var wikidataId = mw.config.get( 'wgWikibaseItemId' );
// The Japanese label being displayed; assigned by displayKanji()
var wikidataKanji;
// Regexes derived from the label; filled in by buildRegexes()
var kanjiRegexes = {};
// Hand setup() to jQuery, which runs it once the DOM is ready
$(setup);