User:Opencooper/showKanji-dev.js
Appearance
Code that you insert on this page could contain malicious content capable of compromising your account. If you import a script from another page with "importScript", "mw.loader.load", "iusc", or "lusc", take note that this causes you to dynamically load a remote script, which could be changed by others. Editors are responsible for all edits and actions they perform, including by scripts. User scripts are not centrally supported and may malfunction or become inoperable due to software changes. A guide to help you find broken scripts is available. If you are unsure whether code you are adding to this page is safe, you can ask at the appropriate village pump. This code will be executed when previewing this page. |
Documentation for this user script can be added at User:Opencooper/showKanji-dev.
// This script shows, if found, the kanji and kana for an article
// It then calls another script, bindKana.js, to clean up the display of ruby
// For configuration, please see the documentation
// TODO: Reject if any base/reading has too low/high of a ratio.
// TODO: Reject if unbalanced parenthesis count
// TODO: <rb> is not actually in the WHATWG standard...
// License: CC0
/* Sample pages:
https://en.wikipedia.org/wiki/Tamio_Kawachi - kana on wikidata
https://en.wikipedia.org/wiki/A_Fantastic_Tale_of_Naruto - kanji from wikidata only
https://en.wikipedia.org/wiki/What_a_Wonderful_World! - kana from wikidata only
https://en.wikipedia.org/wiki/Asako_I_%26_II - from redirect
https://en.wikipedia.org/wiki/Bokura_ga_Ita_(film) - interwiki to subsection
https://ja.wikipedia.org/wiki/%E7%B4%AF - kana part of bolded title
https://en.wikipedia.org/wiki/Bokutachi_no_Koukan_Nikki - kana not in first sentence
https://en.wikipedia.org/wiki/Domestic_Girlfriend - first full sentence not lead
https://ja.wikipedia.org/wiki/SCP%E8%B2%A1%E5%9B%A3 - bolded term w/ kana past first sentence
https://en.wikipedia.org/wiki/Nuclear_fusion - different term w/ kana in lead
https://en.wikipedia.org/wiki/Oedipus_Rex - other stuff in kana
https://en.wikipedia.org/wiki/20th_Century_Boys - overcapturing because title is subset
https://en.wikipedia.org/wiki/Seiza - kana not at start of parenthesis
https://en.wikipedia.org/wiki/Indentation_style - other kana in disambiguation
https://en.wikipedia.org/wiki/Haven%27t_You_Heard%3F_I%27m_Sakamoto - Halfwidth-fullwidth difference
https://en.wikipedia.org/wiki/Kanji_Furutachi - kanji only
https://en.wikipedia.org/wiki/Anata_e - hiragana only
https://en.wikipedia.org/wiki/Anatahan_(film) - katakana only
https://en.wikipedia.org/wiki/A.LI.CE - latin only
https://en.wikipedia.org/wiki/0.5_mm - numeric
https://en.wikipedia.org/wiki/Truth_Coming_Out_of_Her_Well - angle brackets
https://en.wikipedia.org/wiki/South_of_the_Border,_West_of_the_Sun - kana contains comma
https://en.wikipedia.org/wiki/Leap_year - multiple kana separated by comma
https://en.wikipedia.org/wiki/Do_You_Love_Your_Mom_and_Her_Two-Hit_Multi-Target_Attacks%3F - question mark
https://en.wikipedia.org/wiki/Comic_Magazine - exclamation point
https://en.wikipedia.org/wiki/Tsurune - dash
https://en.wikipedia.org/wiki/Flare_(film) - wave dash
https://en.wikipedia.org/wiki/Dog%C3%97Police - multiplication sign
https://en.wikipedia.org/wiki/Foreboding_(film) - spaces
https://en.wikipedia.org/wiki/Age_12 - period in title
https://en.wikipedia.org/wiki/Suzukake_Nanchara - very long kanji
https://en.wikipedia.org/wiki/After_the_Rain_(manga) - kanji + hiragana
https://en.wikipedia.org/wiki/Afro_Tanaka - kanji + katakana
https://en.wikipedia.org/wiki/Battle_Girl:_The_Living_Dead_in_Tokyo_Bay - katakana + latin
https://en.wikipedia.org/wiki/Calling_You_(short_story_collection) - kanji + hiragana + latin
https://en.wikipedia.org/wiki/Ashita_no_Joe - hiragana + katakana
https://en.wikipedia.org/wiki/Arcadia_of_My_Youth - kanji + hiragana + katakana
https://en.wikipedia.org/wiki/Haou_Airen - special character
https://ja.wikipedia.org/wiki/%E6%98%A0%E7%94%BB_%E8%81%B2%E3%81%AE%E5%BD%A2 - reference in between
https://en.wikipedia.org/wiki/Ninjō - No interlanguage, but wiktionary
https://en.wikipedia.org/wiki/Seiza - Interlanguage failed, but wiktionary
https://en.wikipedia.org/wiki/Epsomite - No interlanguage, but wiktionary "see" Table
https://en.wikipedia.org/wiki/Bakayaro!_I%27m_Plenty_Mad - only part of parenthesis extracted
https://en.wikipedia.org/wiki/ORCID
https://en.wikipedia.org/wiki/Survive_Style_5%2B - fails due to +
https://en.wikipedia.org/wiki/Ko-Shint%C5%8D
https://ja.wikipedia.org/wiki/Terminate_and_Stay_Resident
https://en.wikipedia.org/wiki/Ikk%C5%8D-sh%C5%AB
https://en.wikipedia.org/wiki/Kakegoe - doesn't find jawiki interlanguage
https://en.wikipedia.org/wiki/Love_Live!_The_School_Idol_Movie - interpunct in reading
https://en.wikipedia.org/wiki/Lupin_the_Third:_The_Woman_Called_Fujiko_Mine - hyphen in kanji
https://en.wikipedia.org/wiki/Sunscreen
https://en.wikipedia.org/wiki/Flag_of_China
https://en.wikipedia.org/wiki/W3m
https://en.wikipedia.org/wiki/Magnum_Collection_1999_%22Dear%22
https://en.wikipedia.org/wiki/EC_Comics
https://en.wikipedia.org/wiki/CJK_characters
https://en.wikipedia.org/wiki/My_Girlfriend_is_Shobitch
https://en.wikipedia.org/wiki/Immaculate_Conception_Cathedral,_Nagasaki - partial match
https://en.wikipedia.org/wiki/USA-224 - または
https://en.wikipedia.org/wiki/Milk - bad match
https://en.wikipedia.org/wiki/Not_invented_here
*/
/**
 * Entry point. Checks that the script applies to the current page, reserves
 * a placeholder element inside the page heading and asks Wikidata for the
 * Japanese label of the page's item. The response goes to parseJaLabel.
 */
function setup() {
    // Only run when an article is being viewed without an `oldid` in the
    // URL, outside the main page, and on a wiki whose content language is
    // not Japanese
    const isReadingArticle = mw.config.get('wgAction') === 'view'
        && mw.config.get('wgIsArticle')
        && !location.search.split('oldid=')[1]
        && !mw.config.get('wgIsMainPage')
        && mw.config.get('wgContentLanguage') !== 'ja';
    if (!isReadingArticle) {
        return;
    }

    // Without a Wikidata item we assume there are no 1:1 interlanguage links
    // either; this also avoids pages that merely link to a subsection of a
    // jawiki article
    if (wikidataId === null) {
        return;
    }

    // Look for the Vector heading first and the Minerva one second
    let header = $('#firstHeading');
    if (!header.length) {
        header = $('.page-heading');
    }
    if (!header.length) {
        console.error("showKanji-dev.js: Couldn't find a page heading. This skin ("
            + mw.config.get('skin') + ") might not be supported.");
        return;
    }

    // Insert the placeholder right away so that elements added later do not
    // push it down
    header.append("<div id='kanjiInfo' lang='ja' dir='ltr'></div>");

    // Fetch the Japanese label from Wikidata
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetentities
    const labelQuery = {
        action: "wbgetentities",
        ids: wikidataId,
        props: "labels",
        languages: "ja",
        format: "json",
        origin: "*"
    };
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: labelQuery,
        success: parseJaLabel
    });
}
/**
 * Success handler for the Wikidata label request. Shows the Japanese label,
 * if there is one, and requests its reading unless the label is kana only.
 *
 * @param {Object} response wbgetentities response; the entity is looked up
 *     under `response.entities[wikidataId]`
 */
function parseJaLabel(response) {
    const jaEntry = response.entities[wikidataId].labels.ja;
    const rawLabel = jQuery.isEmptyObject(jaEntry) ? undefined : jaEntry.value;
    if (!rawLabel) {
        return;
    }

    const label = rawLabel.toHalfWidth();
    console.log("showKanji-dev.js: kanji: `" + label + "`");
    // The regexes must exist before displayKanji classifies the label
    buildRegexes(label);
    displayKanji(label);

    // A title that is only kana needs no reading
    if (kanjiRegexes.kanaOnly.test(label)) {
        return;
    }
    requestKana();
}
/**
 * Build every regex needed for one Japanese title and store them on the
 * shared `kanjiRegexes` object:
 *   - latinOnly, kanaOnly, hiraganaOnly, katakanaOnly: classify a string
 *   - leadUnspaced, lead: find the title followed by an opening parenthesis
 *     and capture the run of kana (plus the title's own extra characters)
 *     right after that parenthesis
 *
 * @param {string} kanji Japanese title the regexes are tailored to
 */
function buildRegexes(kanji) {
    // Strip $kanji of all kanji, kana and hyphens; whatever is left gets
    // added to the character classes below
    var reKanjiKana = /[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴー-]/g;
    var kanjiStripped = kanji.replace(reKanjiKana, "");
    // The leftovers are spliced into character classes, so escape what is
    // still special in there. An unescaped `]` would close the class early
    // and a lone `\` would swallow the next character.
    kanjiStripped = kanjiStripped.replace(/[\\\]^]/g, "\\$&");
    kanjiStripped += " ";
    // Need to add hyphen escaped since it has special behavior in regex classes
    kanjiStripped += "\\-";
    // Same leftovers without letters, digits and underscores
    var kanjiAuxiliary = kanjiStripped.replace(/\w/g, "");
    kanjiRegexes.latinOnly = /^[A-Za-z0-9\-.?!/,:;@#$%&+=*'"・ ]+$/;
    kanjiRegexes.kanaOnly = new RegExp("^[ぁ-ゔァ-ヴー" + kanjiAuxiliary + "]+$");
    kanjiRegexes.hiraganaOnly = new RegExp("^[ぁ-ゔーA-Za-z" + kanjiAuxiliary + "]+$");
    kanjiRegexes.katakanaOnly = new RegExp("^[ァ-ヴーA-Za-z" + kanjiAuxiliary + "]+$");
    // Add midpoint for Latin in titles
    if (/\w/.test(kanji)) { kanjiStripped += "・"; }
    console.log("showKanji-dev.js: stripped: `" + kanjiStripped + "`");
    var leadReBase = "([ぁ-ゔァ-ヴー" + kanjiStripped + "]+)";
    var kanjiEscaped = mw.util.escapeRegExp(kanji);
    // Account for spaces, but ignore backslash and other misc characters:
    // every kanji/kana/Latin character may be followed by an optional space
    var reKanjiKanaLatin = /([\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6Aぁ-ゔァ-ヴーA-Za-z0-9])/g;
    var kanjiSpaced = kanjiEscaped.replace(/ /g, " ?");
    kanjiSpaced = kanjiSpaced.replace(reKanjiKanaLatin, "$1 ?");
    // Add kanji to regex to make sure we're not getting the reading of some
    // other term
    kanjiRegexes.leadUnspaced = new RegExp(kanjiEscaped + "[^(\n)]*?\\(" + leadReBase);
    kanjiRegexes.lead = new RegExp(kanjiSpaced + "[^(\n)]*?\\(" + leadReBase, "i"); // brittle
}
/**
 * Show the Japanese title inside the #kanjiInfo placeholder and remember it
 * in the shared `wikidataKanji` variable for the later lookup steps.
 *
 * @param {string} kanji Japanese title taken from the Wikidata label
 */
function displayKanji(kanji) {
    wikidataKanji = kanji;
    var info = $("#kanjiInfo");
    // The label comes from a remote, publicly editable source, so insert it
    // as text instead of concatenating it into markup
    info.append($("<ruby>").text(kanji));
    // Add some classes so users can choose to not display for example
    // katakana-only kanji in their CSS
    if (kanjiRegexes.latinOnly.test(kanji)) {
        info.addClass("kanjiInfo-latin-only");
        info.prop("title", "Japanese title in Latin script");
        // Latin-only titles are hidden by default
        info.css("display", "none");
    } else if (kanjiRegexes.hiraganaOnly.test(kanji)) {
        info.addClass("kanjiInfo-hiragana-only");
        info.prop("title", "Japanese title in hiragana");
    } else if (kanjiRegexes.katakanaOnly.test(kanji)) {
        info.addClass("kanjiInfo-katakana-only");
        info.prop("title", "Japanese title in katakana");
    } else {
        info.prop("title", "Japanese title in kanji");
    }
}
/**
 * Ask Wikidata for all claims of the current item; the response is handled
 * by parseKanaClaim.
 */
function requestKana() {
    // API docs: https://www.wikidata.org/w/api.php?action=help&modules=wbgetclaims
    // Every claim is fetched rather than a single property because the kana
    // may only exist as a qualifier on some other claim
    const claimsQuery = {
        action: "wbgetclaims",
        entity: wikidataId,
        format: "json",
        origin: "*"
    };
    $.ajax({
        url: "https://www.wikidata.org/w/api.php",
        data: claimsQuery,
        success: parseKanaClaim
    });
}
/**
 * Success handler for the Wikidata claims request. Looks for a "name in
 * kana" (P1814) value, first as a qualifier on the first statement of a few
 * name-like properties whose text equals the displayed title, then as a
 * claim of its own. Displays the reading when found, otherwise falls back
 * to getInterlanguage().
 *
 * @param {Object} response wbgetclaims response; statements are read from
 *     `response.claims`
 */
function parseKanaClaim(response) {
    var nameInKana = "P1814";
    var properties = {
        title: "P1476",
        nativeLabel: "P1705",
        officialName: "P1448",
        nameInNativeLanguage: "P1559"
    };
    // An error response carries no `claims` member
    var claims = (response && response.claims) || {};

    // Value of a snak, or undefined when it has none ("no value" and
    // "unknown value" snaks come without a `datavalue`)
    function snakValue(snak) {
        return snak && snak.datavalue ? snak.datavalue.value : undefined;
    }

    var kana;
    var source; // key naming where the reading came from, used as CSS suffix
    var wanted = wikidataKanji.replace(/ /g, "");

    // Try getting nameInKana as a qualifier to some properties
    for (var prop in properties) {
        var statements = claims[properties[prop]];
        var statement = statements && statements[0];
        if (!statement) {
            continue;
        }
        var name = snakValue(statement.mainsnak);
        var readings = statement.qualifiers && statement.qualifiers[nameInKana];
        if (!name || typeof name.text !== "string" || !readings) {
            continue;
        }
        // The displayed title was converted to half-width, so normalise the
        // claim text the same way before comparing (spaces are ignored)
        if (name.text.toHalfWidth().replace(/ /g, "") !== wanted) {
            continue;
        }
        var reading = snakValue(readings[0]);
        if (typeof reading === "string" && reading) {
            kana = reading;
            source = prop;
            break;
        }
    }

    // Try getting nameInKana as a general claim
    if (!kana) {
        var general = claims[nameInKana] && claims[nameInKana][0];
        var generalReading = general ? snakValue(general.mainsnak) : undefined;
        if (typeof generalReading === "string" && generalReading) {
            kana = generalReading;
            source = "nameInKana";
        }
    }

    // We couldn't find nameInKana
    if (!kana) {
        getInterlanguage();
        return;
    }

    displayKana(kana.toHalfWidth());
    $("#kanjiInfo").addClass("kanjiInfo-wikidata");
    $("#kanjiInfo").addClass("kanjiInfo-wikidata-" + source);
}
/**
 * Ask the local wiki for the current page's Japanese interlanguage link.
 * With a link, its title (minus any #anchor) is passed to scrapeKana();
 * without one, the lookup falls back to getWiktionary().
 */
function getInterlanguage() {
    const handleLanglinks = (response) => {
        const page = response.query.pages[mw.config.get('wgArticleId')];
        const langlinks = page ? page.langlinks : undefined;
        if (!langlinks) {
            getWiktionary();
            return;
        }
        // rm anchors
        const jaTitle = langlinks[0]["*"].replace(/(.*)#.*/, "$1");
        scrapeKana(jaTitle);
    };

    // Documentation: https://en.wikipedia.org/w/api.php?action=help&modules=query%2Blanglinks
    $.ajax({
        url: location.origin + "/w/api.php",
        data: {
            action: "query",
            format: "json",
            prop: "langlinks",
            lllang: "ja",
            titles: mw.config.get('wgTitle')
        },
        success: handleLanglinks
    });
}
/**
 * Request the plain-text intro extract (two sentences) of a Japanese
 * Wikipedia article, following redirects; the response is handled by
 * getFirstSentence.
 *
 * @param {string} jaLabel title of the jawiki article
 */
function scrapeKana(jaLabel) {
    // API docs: https://www.mediawiki.org/w/api.php?action=help&modules=query%2Bextracts
    const extractQuery = {
        action: "query",
        prop: "extracts",
        format: "json",
        redirects: true,
        exintro: true,
        exsentences: 2,
        exlimit: 1,
        explaintext: true,
        titles: jaLabel,
        origin: "*"
    };
    $.ajax({
        url: "https://ja.wikipedia.org/w/api.php",
        data: extractQuery,
        success: getFirstSentence
    });
}
/**
 * Success handler for the jawiki extract request. Runs the `lead` regex over
 * the extract to pull out the reading. Displays it unless it looks like a
 * katakana-only or Latin-only string, and falls back to getWiktionary()
 * whenever no usable reading comes out.
 *
 * @param {Object} response TextExtracts response; the single page entry is
 *     read from `response.query.pages`
 */
function getFirstSentence(response) {
    // The jawiki page id is unknown up front, so take whichever key is there
    const pages = response.query.pages;
    const firstKey = Object.keys(pages)[0];
    const introText = pages[firstKey].extract;
    if (!introText) {
        console.error("showKanji-dev.js: TextExtracts failed to get a lead for the Japanese article.");
        getWiktionary();
        return;
    }

    const lead = introText.toHalfWidth();
    console.log("showKanji-dev.js: lead: `" + lead + "`");
    console.log("showKanji-dev.js: regex: `" + kanjiRegexes.lead + "`");
    console.log("showKanji-dev.js: regex (unspaced): `" + kanjiRegexes.leadUnspaced + "`");

    const found = lead.match(kanjiRegexes.lead);
    if (!(found && found.length == 2)) {
        getWiktionary();
        return;
    }

    // Rm trailing characters
    const kana = found[1].replace(/[・、 ]$/, "");

    // Abort if our reading is only katakana (for non-Latin) or Latin
    const unusable = (!kanjiRegexes.latinOnly.test(wikidataKanji) && kanjiRegexes.katakanaOnly.test(kana))
        || kanjiRegexes.latinOnly.test(kana);
    if (unusable) {
        console.log("showKanji-dev.js: throwing away reading: " + kana);
        getWiktionary();
        return;
    }

    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-jawiki");
}
// Adapted from:
// http://ilog4.blogspot.com/2015/09/javascript-convert-full-width-and-half.html
// https://stackoverflow.com/a/20488304/1995949
// https://en.wikipedia.org/wiki/Halfwidth_and_fullwidth_forms
/**
 * Return a copy of the string with full-width forms replaced by their
 * half-width counterparts: U+FF01..U+FF5E are shifted down by 0xFEE0 and the
 * ideographic space U+3000 becomes a regular space.
 *
 * @returns {string} the converted string
 */
String.prototype.toHalfWidth = function() {
    return this
        .replace(/[\uff01-\uff5e]/g, function(ch) {
            return String.fromCharCode(ch.charCodeAt(0) - 0xFEE0);
        })
        // Written as an escape so the invisible character cannot get lost or
        // be mistaken for a plain space when the source is edited
        .replace(/\u3000/g, " ");
};
// We use the English Wiktionary because it has more terms and better structure
/**
 * Request the section list of the English Wiktionary page named after the
 * displayed title; the response is handled by findJapaneseSection.
 */
function getWiktionary() {
    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const sectionsQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "sections",
        origin: "*"
    };
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: sectionsQuery,
        success: findJapaneseSection
    });
}
/**
 * Success handler for the Wiktionary section-list request. Finds the first
 * section whose heading is "Japanese" and requests that section's rendered
 * text; the response is handled by parseWiktionary. Logs and stops when the
 * page or the section does not exist.
 *
 * @param {Object} response action=parse response with `parse.sections`, or
 *     an object with an `error` member
 */
function findJapaneseSection(response) {
    if (response.error) {
        console.log("showKanji-dev.js: No Wiktionary item for " + wikidataKanji);
        return;
    }

    let sectionIndex;
    for (const section of response.parse.sections) {
        if (section.line == "Japanese") {
            sectionIndex = section.index;
            break;
        }
    }
    if (sectionIndex == null) {
        console.log("showKanji-dev.js: Wiktionary entry doesn't have a section titled 'Japanese'");
        return;
    }

    // API docs: https://en.wikipedia.org/w/api.php?action=help&modules=parse
    const textQuery = {
        action: "parse",
        format: "json",
        page: wikidataKanji,
        prop: "text",
        section: sectionIndex,
        origin: "*"
    };
    $.ajax({
        url: "https://en.wiktionary.org/w/api.php",
        data: textQuery,
        success: parseWiktionary
    });
}
/**
 * Success handler for the Wiktionary section-text request. Extracts a
 * reading either from the Japanese headword (base text and ruby text are
 * reassembled from its child nodes) or, failing that, from the first ruby
 * element inside a `.Jpan` element. Displays the reading and, when a
 * definition can be found next to the headword, uses its first line as the
 * tooltip.
 *
 * @param {Object} response action=parse response; the HTML is read from
 *     `response.parse.text["*"]`
 */
function parseWiktionary(response) {
    var html = response.parse.text["*"];
    var parsed = $($.parseHTML(html));
    // Wiktionary adds readings as furigana
    var headword = parsed.find(".headword:lang(ja)").first();
    var seeTable = parsed.find(".Jpan ruby").first();
    var kanji = "";
    var kana = "";
    if (headword.length) {
        // Wiktionary already binds their kana, so we have to undo the process to get
        // the constituent parts, at least with the current markup
        var childNodes = headword[0].childNodes;
        for (let i = 0; i < childNodes.length; i++) {
            if (childNodes[i].nodeName == "RUBY") {
                var ruby = $(childNodes[i]); // convert back to JQuery for convenience
                ruby.children("rp").remove();
                // Detach the <rt> so that ruby.text() below yields only the base
                kana += ruby.children("rt").detach().text();
                kanji += ruby.text();
            } else if (childNodes[i].nodeType == 3) { // "#text"
                // Plain text belongs to both the title and its reading
                kanji += childNodes[i].nodeValue;
                kana += childNodes[i].nodeValue;
            }
        }
        // The headword must be exactly the title we are displaying
        if (kanji != wikidataKanji) { return; }
    } else if (seeTable.length) {
        kana = seeTable.children("rt").text();
    } else {
        return;
    }
    if (!kana) {
        return;
    }
    displayKana(kana);
    $("#kanjiInfo").addClass("kanjiInfo-wiktionary");
    // Extra stuff just for fun: first line of the first definition, with
    // bracketed one- or two-digit markers such as [1] removed
    var definition = headword.parent().siblings("ol").children("li").first().text();
    definition = definition.split('\n', 1)[0];
    definition = definition.replace(/\[[0-9]{1,2}\]/g, "");
    // Keep the existing tooltip when there is no definition (always the case
    // for the "see" table branch) instead of blanking it
    if (definition) {
        $("#kanjiInfo").prop("title", definition);
    }
}
/**
 * Attach a reading to the displayed title as ruby text and, unless the title
 * consists of kanji only, load the companion script that cleans up redundant
 * furigana.
 *
 * @param {string} kana reading to show above the title
 */
function displayKana(kana) {
    // The reading is scraped from remote pages, so insert it as text instead
    // of concatenating it into markup
    $("#kanjiInfo ruby").append($("<rt>").text(kana));
    // Cleanup redundant furigana with another script
    var kanjiOnlyRe = /^[\u3400-\u4DB5\u4E00-\u9FCB\uF900-\uFA6A]+$/;
    if (!kanjiOnlyRe.test(wikidataKanji)) {
        mw.loader.load( '//en.wikipedia.org/w/index.php?title=User:Opencooper/bindKana-dev.js&action=raw&ctype=text/javascript' );
    }
}
// Shared state used by the functions above.
// ID of the Wikidata item connected to the current page, as reported by
// MediaWiki; setup() treats null as "no item" and does nothing.
var wikidataId = mw.config.get( 'wgWikibaseItemId' );
// Japanese title currently being displayed; assigned by displayKanji()
var wikidataKanji;
// Regexes tailored to the current title; filled in by buildRegexes()
var kanjiRegexes = {};
// Hand setup to jQuery, which calls it once the document is ready
$(setup);