Jump to content

User:Opencooper/bindKana.js

From Wikipedia, the free encyclopedia
This is an old revision of this page, as edited by Opencooper (talk | contribs) at 17:32, 25 August 2019 (add forward mode). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
// This script takes kanji with ruby text over it and removes repeated parts
// To install, add the following to your common.js:
//     importScript('User:Opencooper/bindKana.js'); // Backlink: [[User:Opencooper/bindKana.js]]

// References:
//             https://www.w3.org/International/articles/ruby/markup.en
//             https://w3c.github.io/i18n-drafts/articles/ruby/styling.en.html
// Fails on okurigana: https://en.wikipedia.org/wiki/I_Am_a_Cat
// Possible bug on: https://en.wikipedia.org/wiki/Douglas%E2%80%93Grumman_scandal
// Overcapturing: https://en.wikipedia.org/wiki/Kare_Kano
//                https://en.wikipedia.org/wiki/Nobunaga_no_Shinobi
//                https://en.wikipedia.org/wiki/Musashino-sen_no_Shimai
//                https://en.wikipedia.org/wiki/Hatfield%E2%80%93McCoy_feud
// Missing part of furigana: https://en.wikipedia.org/wiki/Tsuki_wa_Higashi_ni_Hi_wa_Nishi_ni
//                           https://en.wikipedia.org/wiki/Kawaii
// Katakana can't match extraneous hiragana: https://en.wikipedia.org/wiki/Gompertz_function

/* Test pages:
    https://en.wikipedia.org/wiki/Lear_on_the_Shore - mixed hiragana/katakana
    https://en.wikipedia.org/wiki/One_Cut_of_the_Dead - failed capture blocking later
    https://en.wikipedia.org/wiki/Dog%C3%97Police - partial block
    https://en.wikipedia.org/wiki/Otome_wa_Boku_ni_Koishiteru - partial capture
    https://en.wikipedia.org/wiki/Sacrificial_Princess_and_the_King_of_Beasts - fails
    https://en.wikipedia.org/wiki/Haou_Airen - special character
    https://en.wikipedia.org/wiki/Clamp_no_Kiseki - Latin isn't consumed
    https://en.wikipedia.org/wiki/Cape_St._George - interpunct is a space
*/   

/**
 * Entry point: watches the page heading and runs getKanjiInfo() once a
 * mutation is reported on a <ruby> element inside it.
 *
 * Does nothing unless the current revision of an article (other than the main
 * page) is being viewed and a known heading element exists.
 */
function setup() {
    // If we're not reading an article, do nothing
    const isReadingArticle = mw.config.get("wgAction") === "view"
        && mw.config.get("wgIsArticle")
        && !location.search.split("oldid=")[1]
        && !mw.config.get("wgIsMainPage");
    if (!isReadingArticle) {
        return;
    }

    let header;
    if ($("#firstHeading").length) { // Vector
        header = "#firstHeading";
    } else if ($(".page-heading").length) { // Minerva
        header = ".page-heading";
    } else {
        return;
    }

    const target = document.querySelector(header);
    const observer = new MutationObserver(function(mutationsList) {
        const sawRuby = mutationsList.some(function(mutation) {
            return mutation.target.nodeName === "RUBY";
        });

        // A single batch can contain several records targeting the ruby
        // element; react only once so the binding isn't performed repeatedly.
        if (sawRuby) {
            observer.disconnect();
            getKanjiInfo();
        }
    });

    observer.observe(target, {childList: true, subtree: true});
}

/**
 * Reads the base text and its reading from the #kanjiInfo ruby element,
 * repeatedly calls bindKana() until it reports no further split (or the
 * iteration cap is reached), sanity-checks the result, and displays it if at
 * least one split happened.
 *
 * @throws {Error} If the bases and readings arrays end up with different
 *     lengths, if the cap was reached without a single split, or if a base
 *     containing kanji is left with an empty reading.
 */
function getKanjiInfo() {
    const kanji = $("#kanjiInfo ruby")[0].childNodes[0].nodeValue;
    const kana = $("#kanjiInfo rt").text();

    const bases = [kanji];
    const readings = [kana];
    logTable(readings, bases);

    const maxIterations = 25;
    let iterations = 0;
    let foundBindings = true;
    while (foundBindings && iterations < maxIterations) {
        iterations++;
        foundBindings = bindKana(bases, readings);

        if (foundBindings) {
            logTable(readings, bases);
        }
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("Bases and readings arrays don't have same lengths.");
    }
    // The loop was cut off by the cap only if the last pass still found a
    // binding when the counter reached the maximum.
    if (foundBindings && iterations === maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");

        if (bases.length === 1) {
            throw new Error("Encountered maximum iterations while furigana wasn't split once.");
        }
    }
    for (let i = 0; i < bases.length; i++) {
        if (kanjiRe.test(bases[i]) && readings[i] === "") {
            throw new Error("Kanji base with no reading: " + bases[i]
                            + " at index " + i);
        }
    }

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}

/**
 * Makes one pass over the bases that still have a reading, trying each
 * character-class pattern in turn via searchBase().
 *
 * Only the indices that existed when the pass started are visited. For a
 * given index, no further patterns are tried once the arrays have changed
 * length.
 *
 * @param {string[]} bases - Base text segments; may be modified in place.
 * @param {string[]} readings - Reading for each base; may be modified in place.
 * @returns {boolean} True if the number of bases changed during the pass.
 */
function bindKana(bases, readings) {
    const patterns = [katakanaRe, alphanumRe, hiraganaRe, miscRe];
    const initialCount = bases.length;
    const lengthChanged = function() {
        return bases.length !== initialCount;
    };

    for (let idx = 0; idx < initialCount; idx += 1) {
        if (readings[idx] !== "") {
            // some() stops at the first pattern after which a split is visible
            patterns.some(function(pattern) {
                searchBase(bases, readings, idx, pattern);
                return lengthChanged();
            });
        }
    }

    return lengthChanged();
}

/**
 * Looks for substrings of one base that match a pattern and tries to split
 * the base and its reading on the first such substring that occurs in both.
 *
 * @param {string[]} bases - Base text segments; modified in place on a split.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Position of the base/reading pair to examine.
 * @param {RegExp} re - Pattern whose matches within the base are the
 *     candidate substrings.
 * @throws {Error} If a split changed the array length by anything other than
 *     two elements.
 */
function searchBase(bases, readings, index, re) {
    const baseLength = bases.length;
    const matches = bases[index].match(re);
    if (!matches) {
        return;
    }

    // Handle case where the furigana is just a hiragana version of the katakana
    if (/^[ァ-ヴ]+$/.test(bases[index])) {
        const readingAsKatakana = readings[index].hiraganaToKatakana();
        if (bases[index] === readingAsKatakana) {
            readings[index] = readingAsKatakana;
        }
    }

    for (const match of matches) {
        // Whitespace should be split searching forward
        if (match === " ") {
            splitFuriganaForward(bases, readings, index, match);
        } else {
            splitFuriganaReversed(bases, readings, index, match);
        }

        // We split on the substring
        if (bases.length !== baseLength) {
            // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
            if (bases.length !== baseLength + 2) {
                throw new Error("Splitting added more than two new parts.");
            }

            return;
        }
    }
}

/**
 * Returns a copy of the string in which every character in the range
 * U+3041-U+3096 is shifted up by 0x60 code points; all other characters are
 * left untouched.
 *
 * @returns {string} The converted string.
 */
String.prototype.hiraganaToKatakana = function() {
    const offset = 0x0060;
    const shift = (kana) => String.fromCharCode(kana.charCodeAt(0) + offset);

    return this.replace(/[\u3041-\u3096]/g, shift);
};

/**
 * Splits a base and its reading on the LAST occurrence of `substring`, when
 * it occurs in both. We search from the end because particles are suffixes.
 *
 * On a split, bases[index] is replaced by [left, substring, right] and
 * readings[index] by [left, "", right]; otherwise both arrays are untouched.
 * Plain string searching is used, so no characters of `substring` need
 * escaping and line terminators in the text are handled like any other
 * character.
 *
 * @param {string[]} bases - Base text segments; modified in place on a split.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaReversed(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // First make sure substring is in both the base and its reading
    const basePos = base.lastIndexOf(substring);
    const readingPos = reading.lastIndexOf(substring);
    if (basePos === -1 || readingPos === -1) {
        return;
    }

    console.log(`bindKana.js: string found in both ${base} and ${reading}: \`${substring}\` at index ${index}`);

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
        base.slice(0, basePos),
        substring,
        base.slice(basePos + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingPos),
        "",
        reading.slice(readingPos + substring.length));
}

// TODO: Generalize this with reverse somehow
/**
 * Splits a base and its reading on the FIRST occurrence of `substring`, when
 * it occurs in both.
 *
 * On a split, bases[index] is replaced by [left, substring, right] and
 * readings[index] by [left, "", right]; otherwise both arrays are untouched.
 * Plain string searching is used, so no characters of `substring` need
 * escaping and line terminators in the text are handled like any other
 * character.
 *
 * @param {string[]} bases - Base text segments; modified in place on a split.
 * @param {string[]} readings - Reading for each base; modified in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    const base = bases[index];
    const reading = readings[index];

    // The substring has to be present in both the base and its reading
    const basePos = base.indexOf(substring);
    const readingPos = reading.indexOf(substring);
    if (basePos === -1 || readingPos === -1) {
        return;
    }

    console.log(`bindKana.js: string found in both ${base} and ${reading}: \`${substring}\` at index ${index}`);

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
        base.slice(0, basePos),
        substring,
        base.slice(basePos + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingPos),
        "",
        reading.slice(readingPos + substring.length));
}

/**
 * Reverses a string by Unicode code point, so characters stored as surrogate
 * pairs (e.g. kanji outside the Basic Multilingual Plane) stay intact.
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverseString(str) {
    // Array.from iterates by code point, unlike split("") which cuts
    // surrogate pairs into two separate code units
    return Array.from(str).reverse().join("");
}

/**
 * Hides the original ruby element in #kanjiInfo and appends a new one built
 * from the split bases and readings; hovering #kanjiInfo swaps the two.
 *
 * Segment text is inserted as text, not parsed as HTML, so characters such as
 * `<` or `&` in the title are shown literally.
 *
 * @param {string[]} bases - Base text segments.
 * @param {string[]} readings - Reading for each base ("" for none).
 */
function displayBoundKana(bases, readings) {
    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");
    // Read the marker before the new <rt> elements are added
    const fromWikidata = $("#kanjiInfo rt").hasClass("kanjiInfo-wikidata");

    // Build new ruby element from the two bases and readings arrays
    const $bound = $("<ruby>").addClass("bound");
    for (let i = 0; i < bases.length; i++) {
        $bound.append(
            $("<rb>").text(String(bases[i])),
            $("<rt>").text(String(readings[i]))
        );
    }

    $("#kanjiInfo").append($bound);
    if (fromWikidata) {
        $("#kanjiInfo rt").addClass("kanjiInfo-wikidata");
    }

    $("#kanjiInfo").hover(
        function() {
            $(".bound").hide();
            $(".unbound").show();
        },
        function() {
            $(".unbound").hide();
            $(".bound").show();
        }
    );

    prettifyEnds();
}

/**
 * Tidies the ruby bases inside #kanjiInfo: removes empty base/reading pairs
 * and moves a leading and/or trailing misc character of a base into its own
 * base with an empty reading, for nicer formatting.
 */
function prettifyEnds() {
    // miscRe carries the `g` flag, which makes test() remember lastIndex
    // between calls: after one successful single-character test the next one
    // would start past the end and fail. Test with a stateless copy instead.
    const miscChar = new RegExp(miscRe.source);

    // Exclude misc characters from base; for nicer formatting
    $("#kanjiInfo rb").each(function() {
        const $base = $(this);
        let baseText = $base.text();

        // Rm empty ruby base and readings
        if (baseText === "") {
            $base.next().remove();
            $base.remove();
            return;
        }
        if (baseText === " ") {
            return;
        }

        const initial = baseText[0];
        if (miscChar.test(initial)) {
            console.log("bindKana.js: found misc initial: `" + initial + "`");
            $base.text(baseText.slice(1));
            $base.before("<rb>" + initial + "</rb><rt></rt>");
        }

        // Re-read the text: the initial may just have been stripped
        baseText = $base.text();
        const last = baseText.slice(-1);
        if (miscChar.test(last)) {
            console.log("bindKana.js: found misc final: `" + last + "`");
            $base.text(baseText.slice(0, -1));
            // The new pair goes after this base's own reading
            $base.next().after("<rb>" + last + "</rb><rt></rt>");
        }
    });
}

/**
 * Prints the current readings and bases with console.table().
 *
 * The table logging is done asynchronously, so copies of both arrays are
 * logged rather than the live arrays, which keep changing afterwards.
 *
 * @param {string[]} readings - Reading for each base.
 * @param {string[]} bases - Base text segments.
 */
function logTable(readings, bases) {
    const snapshot = {
        readings: readings.slice(),
        bases: bases.slice(),
    };
    console.table(snapshot);
}

// Character classes used to segment the base text.
// NOTE: the patterns declared with the `g` flag are stateful when used with
// RegExp.prototype.test(), because lastIndex persists between calls.

// A single kanji; used with test() to check that no kanji base lost its reading
var kanjiRe = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/;
// var kanjiRe = /[一-龯]+/g;
// Runs of hiragana
var hiraganaRe = /[ぁ-ゔ]+/g;
// Runs of katakana, including the prolonged sound mark (ー)
var katakanaRe = /[ァ-ヴー]+/g;
// Runs of ASCII letters and digits
var alphanumRe = /[A-Za-z0-9]+/g;
// A single punctuation/separator character (includes the plain space)
var miscRe = /[- !.?・、「」×〜&/]/g; 
// Hand setup to $(), jQuery's document-ready shorthand
$(setup);