User:Opencooper/bindKana.js

Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump.
This code will be executed when previewing this page.
Documentation for this user script can be added at User:Opencooper/bindKana.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script takes kanji with ruby text over it and removes repeated parts
// To install, add the following to your common.js:
//     importScript('User:Opencooper/bindKana.js'); // Backlink: [[User:Opencooper/bindKana.js]]

// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).

// References:
//             https://www.w3.org/International/articles/ruby/markup.en
//             https://w3c.github.io/i18n-drafts/articles/ruby/styling.en.html
//             https://www.w3.org/TR/css-ruby-1/#break-between
// Fails on okurigana: https://en.wikipedia.org/wiki/I_Am_a_Cat
// Possible bug on: https://en.wikipedia.org/wiki/Douglas%E2%80%93Grumman_scandal
// Overcapturing: https://en.wikipedia.org/wiki/Kare_Kano
//                https://en.wikipedia.org/wiki/Nobunaga_no_Shinobi
//                https://en.wikipedia.org/wiki/Musashino-sen_no_Shimai
//                https://en.wikipedia.org/wiki/Hatfield%E2%80%93McCoy_feud
// Missing part of furigana: https://en.wikipedia.org/wiki/Tsuki_wa_Higashi_ni_Hi_wa_Nishi_ni
//                           https://en.wikipedia.org/wiki/Kawaii
// Katakana can't match extraneous hiragana: https://en.wikipedia.org/wiki/Gompertz_function

/* Test pages:
    https://en.wikipedia.org/wiki/Lear_on_the_Shore - mixed hiragana/katakana
    https://en.wikipedia.org/wiki/One_Cut_of_the_Dead - failed capture blocking later
    https://en.wikipedia.org/wiki/Dog%C3%97Police - partial block
    https://en.wikipedia.org/wiki/Otome_wa_Boku_ni_Koishiteru - partial capture
    https://en.wikipedia.org/wiki/Sacrificial_Princess_and_the_King_of_Beasts - fails
    https://en.wikipedia.org/wiki/Clamp_no_Kiseki - Latin isn't consumed
    https://en.wikipedia.org/wiki/Cape_St._George - interpunct is a space
    https://en.m.wikipedia.org/wiki/Chūshingura:_Hana_no_Maki,_Yuki_no_Maki - whitespace ignored on mobile
*/   

// Entry point. On a plain article view, watches the page heading and runs
// getKanjiInfo() once, the first time a <ruby> element inside it has its
// children changed.
// Does nothing when the action is not 'view', the page is not an article,
// an `oldid` value is present in the URL, the page is the main page, or
// neither known heading element exists.
function setup() {
    // If we're not reading an article, do nothing
    if (!(mw.config.get( 'wgAction' ) === 'view'
          && mw.config.get( 'wgIsArticle' )
          && !location.search.split('oldid=')[1]
          && !mw.config.get("wgIsMainPage"))) {
        return;
    }

    var header;
    if ($('#firstHeading').length) { // Vector
        header = "#firstHeading";
    } else if ($('.page-heading').length) { // Minerva
        header = ".page-heading";
    } else {
        return;
    }

    var target = document.querySelector(header);
    var observer = new MutationObserver(function(mutationsList) {
        var rubyChanged = mutationsList.some(function(mutation) {
            return mutation.target.nodeName === "RUBY";
        });
        if (!rubyChanged) {
            return;
        }

        // A single batch may contain several records targeting the <ruby>;
        // binding must still only happen once, so act on the batch as a whole.
        observer.disconnect();
        getKanjiInfo();
    });

    observer.observe(target, {childList: true, subtree: true});
}

// Reads the ruby base text and its reading from #kanjiInfo, repeatedly calls
// bindKana() until it reports no further split (or the iteration cap is hit),
// and hands the result to displayBoundKana() if at least one split happened.
// Throws an Error if the bases and readings arrays end up with different lengths.
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") == "none" || $("#kanjiInfo rt").css("display") == "none") {
        return;
    }

    // Nothing to do if the ruby element (or its leading text node) is missing
    var rubyElement = $("#kanjiInfo ruby")[0];
    var firstChild = rubyElement && rubyElement.childNodes[0];
    var kanji = firstChild ? firstChild.nodeValue : null;
    if (kanji == null) {
        return;
    }
    var kana = $("#kanjiInfo rt").text();

    var bases = [kanji];
    var readings = [kana];
    // logTable(readings, bases);

    var iterations = 0;
    var maxIterations = 25;
    var foundBindings = true;
    while (foundBindings && iterations != maxIterations) {
        iterations++;
        foundBindings = bindKana(bases, readings);

        // if (foundBindings) {
        //     logTable(readings, bases);
        //     console.log("bindKana.js: readings: `" + readings + "`");
        //     console.log("bindKana.js: bases: `" + bases + "`\n");
        // }
    }

    // Sanity check
    if (bases.length != readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }
    // The loop only stops early when a pass finds nothing, so the cap was hit
    // exactly when the last allowed pass still found a binding.
    if (foundBindings && iterations === maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");
    }

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}

// Runs one splitting pass over the base/reading pairs that existed when the
// pass started. Each pair with a non-empty reading is offered to searchBase()
// with the katakana, alphanumeric, hiragana and misc patterns in that order,
// moving on to the next pair as soon as the arrays have grown.
// Returns true if the pass split anything, false otherwise.
// Throws an Error if a split left a kanji base without a reading, or an
// empty base with a reading.
function bindKana(bases, readings) {
    var patterns = [kanaRegexes.katakanaRe, kanaRegexes.alphanumRe,
                    kanaRegexes.hiraganaRe, kanaRegexes.miscRe];
    var startCount = bases.length;

    for (var pos = 0; pos < startCount; pos++) {
        if (readings[pos] === "") {
            continue;
        }

        for (var p = 0; p < patterns.length; p++) {
            searchBase(bases, readings, pos, patterns[p]);

            // Compared against the count at the start of the pass
            if (bases.length != startCount) {
                break;
            }
        }
    }

    if (bases.length == startCount) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    bases.forEach(function(base, at) {
        if (kanaRegexes.kanjiRe.test(base) && readings[at] === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                            + base + "` at index " + at);
        }
        if (base === "" && readings[at]) {
            throw new Error("bindKana.js: Blank base with reading: `"
                            + readings[at] + "` at index " + at);
        }
    });

    return true;
}

// Collects every run in bases[index] that matches `re` and tries to split the
// base/reading pair on each run in turn, returning after the first run that
// actually produces a split.
// Throws an Error if a split grew the arrays by anything other than two entries.
function searchBase(bases, readings, index, re) {
    var sizeBefore = bases.length;
    var runs = bases[index].match(re);
    if (!runs) {
        return;
    }

    // Handle case where the furigana is just a hiragana version of the katakana
    // Only works if whole thing is split along the reading
    var wholeBaseIsKatakana = /^[ァ-ヴ]+$/.test(bases[index]);
    if (wholeBaseIsKatakana && bases[index] == readings[index].hiraganaToKatakana()) {
        readings[index] = readings[index].hiraganaToKatakana();
    }

    // Misc stuff like whitespace should be split searching forward
    var split = (re !== kanaRegexes.miscRe) ? splitFuriganaReverse
                                            : splitFuriganaForward;

    for (var run of runs) {
        split(bases, readings, index, run);

        if (bases.length == sizeBefore) {
            continue; // this run wasn't in the reading, try the next one
        }

        // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
        if (bases.length != sizeBefore + 2) {
            throw new Error("bindKana.js: Splitting added more than two new parts.");
        }
        return;
    }
}

// Returns a copy of the string in which every character in U+3041-U+3096
// (hiragana) is shifted up by 0x60 code points, onto the matching katakana.
// All other characters are left as they are.
String.prototype.hiraganaToKatakana = function() {
    var KATAKANA_OFFSET = 0x0060;
    return this.replace(/[\u3041-\u3096]/g, function(kana) {
        var shifted = kana.charCodeAt(0) + KATAKANA_OFFSET;
        return String.fromCharCode(shifted);
    });
};

// We search from the end of the strings because particles are suffixes.
// If `substring` occurs in both bases[index] and readings[index], replaces
// each of those entries, in place, with three entries split around the LAST
// occurrence: [left, substring, right] for the base and [left, "", right]
// for the reading. Leaves both arrays untouched otherwise.
// Plain string searching is used, so the substring needs no regex escaping
// and the text around it is kept even if it contains line breaks.
function splitFuriganaReverse(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];
    var baseAt = base.lastIndexOf(substring);
    var readingAt = reading.lastIndexOf(substring);

    // First make sure substring is in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the three parts
    bases.splice(index, 1,
                 base.slice(0, baseAt),
                 substring,
                 base.slice(baseAt + substring.length));

    // AaBbCc -> Aa | Bb | Cc; the shared part needs no ruby of its own
    readings.splice(index, 1,
                    reading.slice(0, readingAt),
                    "",
                    reading.slice(readingAt + substring.length));
}

// Returns `str` with its characters in reverse order.
// Iterates by code point rather than by UTF-16 unit, so characters outside
// the Basic Multilingual Plane (e.g. rarer kanji) keep their surrogate pairs
// intact instead of being turned into two swapped halves.
function reverseString(str) {
    return Array.from(str).reverse().join("");
}

// TODO: Generalize this with reverse somehow
// If `substring` occurs in both bases[index] and readings[index], replaces
// each of those entries, in place, with three entries split around the FIRST
// occurrence: [left, substring, right] for the base and [left, "", right]
// for the reading. Leaves both arrays untouched otherwise.
// Plain string searching is used, so characters such as "?" or "." in the
// substring need no regex escaping.
function splitFuriganaForward(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];
    var baseAt = base.indexOf(substring);
    var readingAt = reading.indexOf(substring);

    // The substring has to be in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the three parts
    bases.splice(index, 1,
                 base.slice(0, baseAt),
                 substring,
                 base.slice(baseAt + substring.length));

    // The shared part needs no ruby of its own
    readings.splice(index, 1,
                    reading.slice(0, readingAt),
                    "",
                    reading.slice(readingAt + substring.length));
}

// Hides the original ruby inside #kanjiInfo (marking it "unbound"), appends a
// new "bound" ruby built from the parallel bases/readings arrays, swaps the
// two while the pointer hovers over #kanjiInfo, and finally tidies the new
// ruby with prettifyEnds().
function displayBoundKana(bases, readings) {
    // The parts are plain text read from the page; escape them so that
    // characters like "<" and "&" are shown literally instead of being
    // parsed as markup when the string below is turned into elements.
    var htmlEntities = {"&": "&amp;", "<": "&lt;", ">": "&gt;", '"': "&quot;", "'": "&#039;"};
    function escapeHtml(text) {
        return String(text).replace(/[&<>"']/g, function(ch) {
            return htmlEntities[ch];
        });
    }

    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");
    var fromWikidata = $("#kanjiInfo rt").hasClass("kanjiInfo-wikidata");

    // Build new ruby element from the two bases and readings arrays
    var newKana = "<ruby class='bound'>";
    for (var i = 0; i < bases.length; i++) {
        newKana += "<rb>" + escapeHtml(bases[i]) + "</rb>";
        newKana += "<rt>" + escapeHtml(readings[i]) + "</rt>";
    }
    newKana += "</ruby>";

    $("#kanjiInfo").append(newKana);
    if (fromWikidata) {
        // Carry the marker class over to the newly added <rt> elements too
        $("#kanjiInfo rt").addClass("kanjiInfo-wikidata");
    }

    // Show the original (unbound) ruby while hovering, the bound one otherwise
    $("#kanjiInfo").hover(
        function() {
            $(".bound").hide();
            $(".unbound").show();
        },
        function() {
            $(".unbound").hide();
            $(".bound").show();
        }
    );

    prettifyEnds();
}

// Post-processes every <rb> inside #kanjiInfo for nicer formatting:
//   - an empty base is removed together with the element that follows it;
//   - a base that is exactly one space is left alone;
//   - a leading and/or trailing misc character (per kanaRegexes.miscRe) is
//     moved out of the base into its own <rb> with an empty <rt>.
function prettifyEnds() {
    // Single-character check against the shared regex
    function isMisc(ch) {
        kanaRegexes.miscRe.lastIndex = 0; // reset regex
        return kanaRegexes.miscRe.test(ch);
    }

    $("#kanjiInfo rb").each(function() {
        var $rb = $(this);
        var text = $rb.text();

        // Rm empty ruby base and readings
        if (text === "") {
            $rb.next().remove();
            $rb.remove();
            return;
        }
        if (text === " ") {
            return;
        }

        var head = text.charAt(0);
        if (isMisc(head)) {
            $rb.text(text.slice(1));
            $rb.before("<rb>" + head + "</rb><rt></rt>");
        }

        // Re-read: the base may just have lost its first character
        text = $rb.text();
        var tail = text.slice(-1);
        if (isMisc(tail)) {
            $rb.text(text.slice(0, text.length - 1));
            $rb.next().after("<rb>" + tail + "</rb><rt></rt>");
        }
    });
}

// Debug helper: prints the readings and bases side by side with console.table.
// The arrays are copied first so that later in-place splits don't change
// what ends up being shown.
function logTable(readings, bases) {
    var snapshot = {
        readings: readings.slice(0),
        bases: bases.slice(0)
    };
    console.table(snapshot);
}

// Character-class patterns shared by the splitting and clean-up code.
// All but kanjiRe carry the /g flag, so test()/exec() on them remember
// lastIndex between calls; reset it before reusing one with test().
var kanaRegexes = {
    // One kanji: the whole CJK Unified Ideographs Extension A, CJK Unified
    // Ideographs and CJK Compatibility Ideographs blocks, so that characters
    // added to those blocks in later Unicode versions are recognised as well
    kanjiRe: /[\u3400-\u4DBF\u4E00-\u9FFF\uF900-\uFAFF]/,
    // kanjiRe: /[一-龯]+/g,
    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,
    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,
    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,
    // Single punctuation/spacing characters
    miscRe: /[- !.?・、「」×〜&/]/g
};

// Hand setup to jQuery so it runs once the DOM is ready
$(setup);