User:Opencooper/bindKana.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Opencooper/bindKana.
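// This script takes the page title's kanji with ruby text (furigana) over it and
// blanks the parts of the reading that merely repeat kana/latin/punctuation
// already present in the base text, so each reading sits over its own kanji.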
// To install, add the following to your common.js:
// importScript('User:Opencooper/bindKana.js'); // Backlink: [[User:Opencooper/bindKana.js]]
// The basic algorithm searches for *continuous* hiragana/katakana/latin/punctuation
// strings that are in both the base and reading, and splits on these. This does
// not take into account any lexical information (so it doesn't know anything about
// particles or individual kanji readings). It can also fail for more complicated
// cases, but the script should be able to abort for these (maybe in the future we can
// continue and just ignore that specific base and substring).
// References:
// https://www.w3.org/International/articles/ruby/markup.en
// https://w3c.github.io/i18n-drafts/articles/ruby/styling.en.html
// https://www.w3.org/TR/css-ruby-1/#break-between
// Fails on okurigana: https://en.wikipedia.org/wiki/I_Am_a_Cat
// Possible bug on: https://en.wikipedia.org/wiki/Douglas%E2%80%93Grumman_scandal
// Overcapturing: https://en.wikipedia.org/wiki/Kare_Kano
// https://en.wikipedia.org/wiki/Nobunaga_no_Shinobi
// https://en.wikipedia.org/wiki/Musashino-sen_no_Shimai
// https://en.wikipedia.org/wiki/Hatfield%E2%80%93McCoy_feud
// Missing part of furigana: https://en.wikipedia.org/wiki/Tsuki_wa_Higashi_ni_Hi_wa_Nishi_ni
// https://en.wikipedia.org/wiki/Kawaii
// Katakana can't match extraneous hiragana: https://en.wikipedia.org/wiki/Gompertz_function
/* Test pages:
https://en.wikipedia.org/wiki/Lear_on_the_Shore - mixed hiragana/katakana
https://en.wikipedia.org/wiki/One_Cut_of_the_Dead - failed capture blocking later
https://en.wikipedia.org/wiki/Dog%C3%97Police - partial block
https://en.wikipedia.org/wiki/Otome_wa_Boku_ni_Koishiteru - partial capture
https://en.wikipedia.org/wiki/Sacrificial_Princess_and_the_King_of_Beasts - fails
https://en.wikipedia.org/wiki/Clamp_no_Kiseki - Latin isn't consumed
https://en.wikipedia.org/wiki/Cape_St._George - interpunct is a space
https://en.m.wikipedia.org/wiki/Chūshingura:_Hana_no_Maki,_Yuki_no_Maki - whitespace ignored on mobile
*/
/**
 * Entry point. On a normal article view, watches the page heading for child
 * changes and runs getKanjiInfo() once, the first time a mutation targets a
 * <ruby> element.
 *
 * Does nothing when the page is not a plain article view (wrong action, not an
 * article, an `oldid=` revision, or the main page) or when neither known
 * heading element is present.
 */
function setup() {
    // If we're not reading an article, do nothing
    var isArticleView = mw.config.get('wgAction') === 'view'
        && mw.config.get('wgIsArticle')
        && !location.search.split('oldid=')[1]
        && !mw.config.get('wgIsMainPage');
    if (!isArticleView) {
        return;
    }

    var header;
    if ($('#firstHeading').length) { // Vector
        header = '#firstHeading';
    } else if ($('.page-heading').length) { // Minerva
        header = '.page-heading';
    } else {
        return;
    }

    var target = document.querySelector(header);
    var observer = new MutationObserver(function(mutationsList) {
        for (var mutation of mutationsList) {
            if (mutation.target.nodeName === 'RUBY') {
                observer.disconnect();
                getKanjiInfo();
                // A single batch may hold several records targeting the <ruby>;
                // stop here so the binding is only performed once.
                return;
            }
        }
    });
    observer.observe(target, {childList: true, subtree: true});
}
/**
 * Reads the kanji (base) and kana (reading) from #kanjiInfo, repeatedly calls
 * bindKana() on them until it reports no further split or the iteration cap is
 * reached, and hands the result to displayBoundKana() if any split happened.
 *
 * @throws {Error} If the bases and readings arrays end up with different lengths.
 */
function getKanjiInfo() {
    // Don't run if the kanji or the ruby is hidden
    if ($("#kanjiInfo").css("display") === "none" || $("#kanjiInfo rt").css("display") === "none") {
        return;
    }

    // The base text is read from the first child node of the <ruby>; bail out
    // quietly when there is no <ruby> or that node carries no text.
    var ruby = $("#kanjiInfo ruby")[0];
    var firstChild = ruby && ruby.childNodes[0];
    var kanji = firstChild && firstChild.nodeValue;
    if (typeof kanji !== "string") {
        return;
    }
    var kana = $("#kanjiInfo rt").text();

    // Parallel arrays: readings[i] is the furigana shown over bases[i]
    var bases = [kanji];
    var readings = [kana];
    // logTable(readings, bases);

    var iterations = 0;
    var maxIterations = 25;
    var foundBindings = true;
    while (foundBindings && iterations !== maxIterations) {
        iterations++;
        foundBindings = bindKana(bases, readings);
        // if (foundBindings) {
        //     logTable(readings, bases);
        //     console.log("bindKana.js: readings: `" + readings + "`");
        //     console.log("bindKana.js: bases: `" + bases + "`\n");
        // }
    }

    // Sanity check
    if (bases.length !== readings.length) {
        throw new Error("bindKana.js: Bases and readings arrays don't have same lengths.");
    }

    // The cap was hit only if the last allowed pass still found something to
    // split; a loop that ended because bindKana() returned false is normal.
    if (foundBindings && iterations === maxIterations) {
        console.warn("bindKana.js: Encountered maximum iterations.");
        if (bases.length === 1) {
            throw new Error("bindKana.js: Encountered maximum iterations while furigana wasn't split once.");
        }
    }

    // If any binding occurred
    if (bases.length > 1) {
        displayBoundKana(bases, readings);
    }
}
/**
 * Runs one splitting pass over the bases that existed when the pass started.
 *
 * For each such index whose reading is not blank, searchBase() is tried with
 * the katakana, alphanumeric, hiragana and misc patterns in that order, moving
 * on to the next index as soon as the bases array has a different length than
 * it had at the start of the pass.
 *
 * @param {string[]} bases - Base texts; modified in place by searchBase().
 * @param {string[]} readings - Readings parallel to bases; modified in place.
 * @returns {boolean} true if the pass changed the number of bases, else false.
 * @throws {Error} If, after a split, a base containing kanji has a blank
 *     reading or a blank base has a non-blank reading.
 */
function bindKana(bases, readings) {
    var initialCount = bases.length;
    var patterns = [
        kanaRegexes.katakanaRe,
        kanaRegexes.alphanumRe,
        kanaRegexes.hiraganaRe,
        kanaRegexes.miscRe
    ];

    for (var pos = 0; pos < initialCount; pos++) {
        if (readings[pos] === "") {
            continue;
        }
        // some() stops at the first pattern after which the array has grown
        patterns.some(function(pattern) {
            searchBase(bases, readings, pos, pattern);
            return bases.length !== initialCount;
        });
    }

    if (bases.length === initialCount) {
        return false;
    }

    // Make sure splitting didn't mess up the bindings
    for (var at = 0; at < bases.length; at++) {
        var base = bases[at];
        var reading = readings[at];
        if (kanaRegexes.kanjiRe.test(base) && reading === "") {
            throw new Error("bindKana.js: Kanji base with no reading: `"
                            + base + "` at index " + at);
        }
        if (base === "" && reading) {
            throw new Error("bindKana.js: Blank base with reading: `"
                            + reading + "` at index " + at);
        }
    }
    return true;
}
/**
 * Tries to split bases[index] / readings[index] on each substring of the base
 * matched by `re`, stopping at the first substring that produces a split.
 *
 * @param {string[]} bases - Base texts; modified in place when a split happens.
 * @param {string[]} readings - Readings parallel to bases; modified in place.
 * @param {number} index - Position of the base/reading pair to examine.
 * @param {RegExp} re - Pattern used to pull candidate substrings from the base.
 * @throws {Error} If a split changed the array length by anything other than 2.
 */
function searchBase(bases, readings, index, re) {
    var countBefore = bases.length;
    var candidates = bases[index].match(re);
    if (!candidates) {
        return;
    }

    // Misc stuff like whitespace should be split searching forward
    var searchForward = (re === kanaRegexes.miscRe);

    for (var candidate of candidates) {
        // Handle case where the furigana is just a hiragana version of the katakana
        // Only works if whole thing is split along the reading
        var base = bases[index];
        if (/^[ァ-ヴ]+$/.test(base)) {
            var readingAsKatakana = readings[index].hiraganaToKatakana();
            if (base === readingAsKatakana) {
                readings[index] = readingAsKatakana;
            }
        }

        if (searchForward) {
            splitFuriganaForward(bases, readings, index, candidate);
        } else {
            splitFuriganaReverse(bases, readings, index, candidate);
        }

        var added = bases.length - countBefore;
        if (added === 0) {
            // No split on this candidate; try the next one
            continue;
        }
        // Splitting should result in [l|match|r] w/ ruby of [l|""|r]
        if (added !== 2) {
            throw new Error("bindKana.js: Splitting added more than two new parts.");
        }
        return;
    }
}
/**
 * String#hiraganaToKatakana: returns a copy of the string with every character
 * in U+3041..U+3096 (hiragana) shifted up by 0x60 to its katakana counterpart.
 * All other characters are left as they are.
 *
 * Defined as a non-enumerable property so that it does not show up in
 * `for...in` loops over strings run by other scripts on the page.
 */
Object.defineProperty(String.prototype, "hiraganaToKatakana", {
    value: function() {
        return this.replace(/[\u3041-\u3096]/g, function(ch) {
            return String.fromCharCode(ch.charCodeAt(0) + 0x0060);
        });
    },
    writable: true,
    configurable: true,
    enumerable: false
});
// We search from the end of the string because particles are suffixes
/**
 * Splits bases[index] and readings[index] around the LAST occurrence of
 * `substring`, provided it occurs in both. The single entry at `index` is
 * replaced by three entries:
 *   bases:    [left of match, substring, right of match]
 *   readings: [left of match, "",        right of match]
 * If the substring is missing from either string, nothing is changed.
 *
 * Plain string search is used, so the substring needs no regex escaping and
 * line breaks inside the text are handled like any other character.
 *
 * @param {string[]} bases - Base texts; modified in place.
 * @param {string[]} readings - Readings parallel to bases; modified in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaReverse(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];
    var baseAt = base.lastIndexOf(substring);
    var readingAt = reading.lastIndexOf(substring);

    // First make sure substring is in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
        base.slice(0, baseAt),
        substring,
        base.slice(baseAt + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingAt),
        "",
        reading.slice(readingAt + substring.length));
}
/**
 * Returns `str` with its characters in reverse order.
 *
 * Reverses by Unicode code point rather than by UTF-16 code unit, so
 * characters outside the Basic Multilingual Plane (for example some rarer
 * kanji) are kept intact instead of having their surrogate halves swapped.
 *
 * @param {string} str - Text to reverse.
 * @returns {string} The reversed text.
 */
function reverseString(str) {
    return Array.from(str).reverse().join("");
}
// TODO: Generalize this with reverse somehow
/**
 * Splits bases[index] and readings[index] around the FIRST occurrence of
 * `substring`, provided it occurs in both. The single entry at `index` is
 * replaced by three entries:
 *   bases:    [left of match, substring, right of match]
 *   readings: [left of match, "",        right of match]
 * If the substring is missing from either string, nothing is changed.
 *
 * Plain string search is used, so the substring needs no regex escaping and
 * line breaks inside the text are handled like any other character.
 *
 * @param {string[]} bases - Base texts; modified in place.
 * @param {string[]} readings - Readings parallel to bases; modified in place.
 * @param {number} index - Position of the pair to split.
 * @param {string} substring - Text to split on.
 */
function splitFuriganaForward(bases, readings, index, substring) {
    var base = bases[index];
    var reading = readings[index];
    var baseAt = base.indexOf(substring);
    var readingAt = reading.indexOf(substring);

    // The substring has to be in both the base and its reading
    if (baseAt === -1 || readingAt === -1) {
        return;
    }

    // Start at index, delete one element, and then insert the other parameters
    bases.splice(index, 1,
        base.slice(0, baseAt),
        substring,
        base.slice(baseAt + substring.length));
    readings.splice(index, 1,
        reading.slice(0, readingAt),
        "",
        reading.slice(readingAt + substring.length));
}
/**
 * Hides the existing ruby inside #kanjiInfo (tagging it "unbound") and appends
 * a new <ruby class="bound"> made of one <rb>/<rt> pair per base/reading.
 * While the pointer is over #kanjiInfo the unbound ruby is shown instead of
 * the bound one. Finishes by calling prettifyEnds().
 *
 * The base and reading strings are inserted as text, never parsed as HTML, so
 * markup characters in them cannot create elements.
 *
 * @param {string[]} bases - Base text segments.
 * @param {string[]} readings - Readings parallel to bases ("" for none).
 */
function displayBoundKana(bases, readings) {
    // Remember whether the existing reading carried the Wikidata marker class
    // so the same class can be put on the new <rt> elements
    var fromWikidata = $("#kanjiInfo rt").hasClass("kanjiInfo-wikidata");

    $("#kanjiInfo ruby").addClass("unbound");
    $(".unbound").css("display", "none");

    // Build new ruby element from the two bases and readings arrays
    var $bound = $("<ruby>").addClass("bound");
    for (var i = 0; i < bases.length; i++) {
        $bound.append(
            $("<rb>").text(bases[i]),
            $("<rt>").text(readings[i])
        );
    }
    $("#kanjiInfo").append($bound);

    if (fromWikidata) {
        $("#kanjiInfo rt").addClass("kanjiInfo-wikidata");
    }

    // Show the original ruby while hovering, the bound one otherwise
    $("#kanjiInfo")
        .on("mouseenter", function() {
            $(".bound").hide();
            $(".unbound").show();
        })
        .on("mouseleave", function() {
            $(".unbound").hide();
            $(".bound").show();
        });

    prettifyEnds();
}
/**
 * Tidies every <rb> inside #kanjiInfo for nicer formatting:
 *  - an empty <rb> is removed together with the element that follows it;
 *  - a <rb> holding exactly one space is left alone;
 *  - otherwise a leading and/or trailing misc character (per
 *    kanaRegexes.miscRe) is moved out of the <rb> into its own
 *    <rb>…</rb><rt></rt> pair placed before / after it.
 */
function prettifyEnds() {
    var miscRe = kanaRegexes.miscRe;

    // miscRe is global, so test() resumes from lastIndex; rewind before each use
    function isMisc(ch) {
        miscRe.lastIndex = 0;
        return miscRe.test(ch);
    }

    // Exclude misc characters from base; for nicer formatting
    $("#kanjiInfo rb").each(function(_, rb) {
        var $rb = $(rb);
        var content = $rb.text();

        // Rm empty ruby base and readings
        if (content === "") {
            $rb.next().remove();
            $rb.remove();
            return;
        }
        if (content === " ") {
            return;
        }

        var head = content.charAt(0);
        if (isMisc(head)) {
            // console.log("bindKana.js: found misc initial: `" + head + "`");
            $rb.text(content.slice(1));
            $rb.before("<rb>" + head + "</rb><rt></rt>");
        }

        // Re-read: the leading character may just have been stripped
        content = $rb.text();
        var tail = content.slice(-1);
        if (isMisc(tail)) {
            // console.log("bindKana.js: found misc final: `" + tail + "`");
            $rb.text(content.slice(0, -1));
            $rb.next().after("<rb>" + tail + "</rb><rt></rt>");
        }
    });
}
// The table logging is done asynchronously, so we log copies of the arrays
// (shallow copies are enough: the elements are strings)
/**
 * Debug helper: prints the current readings and bases with console.table().
 *
 * @param {string[]} readings - Readings to print.
 * @param {string[]} bases - Bases to print.
 */
function logTable(readings, bases) {
    console.table({
        readings: readings.slice(),
        bases: bases.slice()
    });
}
// Character-class patterns shared by the splitting and formatting functions.
// All but kanjiRe carry the `g` flag: match() with them returns every match,
// while test() on them is stateful through lastIndex.
var kanaRegexes = {
    // A single kanji anywhere in the string (no `g` flag)
    kanjiRe: /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]/,
    // kanjiRe: /[一-龯]+/g,

    // Runs of hiragana
    hiraganaRe: /[ぁ-ゔ]+/g,

    // Runs of katakana, including the long-vowel mark
    katakanaRe: /[ァ-ヴー]+/g,

    // Runs of ASCII letters and digits
    alphanumRe: /[A-Za-z0-9]+/g,

    // One punctuation / spacing character at a time
    miscRe: /[- !.?・、「」×〜&/]/g
};
// Run setup() once the DOM is ready ($(fn) is jQuery's document-ready shorthand)
$(setup);