Jump to content

User:Phlsph7/ListUnreferencedParagraphs.js

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
Note: After saving, you have to bypass your browser's cache to see the changes. Google Chrome, Firefox, Microsoft Edge and Safari: Hold down the ⇧ Shift key and click the Reload toolbar button. For details and instructions about other browsers, see Wikipedia:Bypass your cache.
(function(){
	const scriptName = 'List Unreferenced Paragraphs';

	// Once mediawiki.util is loaded and the DOM is ready, add one link per tool
	// to the 'p-tb' portlet. Each link runs its tool instead of following '#'.
	$.when(mw.loader.using('mediawiki.util'), $.ready).then(() => {
		const highlightPortletlinkName = 'Highlight Unreferenced Paragraphs';
		const tools = [
			[scriptName, listUnreferencedParagraphs],
			[highlightPortletlinkName, highlightUnreferencedParagraphs]
		];

		for (const [label, runTool] of tools) {
			// The link id is the label followed by 'Id'.
			const portletLink = mw.util.addPortletLink('p-tb', '#', label, label + 'Id');
			portletLink.onclick = (event) => {
				event.preventDefault();
				runTool();
			};
		}
	});
	
	function listUnreferencedParagraphs(){
		// Delay in milliseconds between starting the requests for two consecutive articles
		// (passed to recursivelyProcessArticles, which hands it to setTimeout).
		const timeout = 50;
		// Set to true by the Stop button; recursivelyProcessArticles checks it before each article.
		let stopProcessing = false;
		// Hide the regular page content while the tool is open; the Close button restores it.
		const content = document.getElementById('content');
		const contentContainer = content.parentElement;
		content.style.display = 'none';

		// Append a placeholder next to the hidden content and replace it with the tool UI
		// by assigning its outerHTML.
		// NOTE(review): the <style> element in the template uses bare `textarea` and `button`
		// selectors, so these rules are not limited to #scriptContainer while the UI is shown.
		let scriptContainer = document.createElement('div');
		contentContainer.appendChild(scriptContainer);
		scriptContainer.outerHTML = `
	<div id="scriptContainer" style="display:flex; flex-direction: column;">
		<style>
			textarea {
				resize: none;
				padding: 5px;
			}
			button {
				margin: 5px;
			}
		</style>
		<h1>Unreferenced Paragraph Counter</h1>
		<div style="display:flex;">
			<div style="flex: 1; display:flex; flex-direction: column; margin: 5px;  height: 50vh; overflow-y: auto;">
				<label for="taList">Article Titles</label>
				<textarea id="taList" style="height: 100%;"></textarea>
			</div>
			<div style="flex: 2; display:flex; flex-direction: column; margin: 5px; height: 50vh; overflow-y: auto;">
				<label for="tableCounter">Overview table</label>
				<table id="tableCounter" class="wikitable" style="height: 100%; margin: 0px; width: 100%; border-collapse: collapse;">
					<thead>
						<tr>
							<th>Article title</th>
							<th title="paragraphs that require and lack references">Paragraphs without references</th>
							<th>Maintenance tags</th>
						</tr>
					</thead>
					<tbody id="tbodyCounter">
					
					</tbody>
				</table>
			</div>
		</div>
		<div style="display:flex; flex-direction: column">
			<div style="display:flex;">
				<button id="btStart" style="flex: 1;">Start</button>
				<button id="btStop" disabled style="flex: 1;">Stop</button>
				<button id="btCopy" style="flex: 1;">Copy</button>
			</div>
			<div>
				<button id="btClose" style="width: 100%;">Close</button>
			</div>
		</div>
	</div>
	`;
		// Start button: read the title list, rebuild the overview table and start processing.
		const btStart = $('#btStart');
		btStart.click(function(){
			stopProcessing = false;
			btStart.prop("disabled", true);
			btStop.prop("disabled", false);

			// One title per line. Surrounding whitespace is stripped and blank lines are
			// skipped, so an empty textarea or stray empty lines no longer produce
			// bogus table rows and failing API requests.
			const titleLines = $('#taList').val()
				.split(/\r?\n/)
				.map((line) => line.trim())
				.filter((line) => line.length > 0);

			// remove duplicates (the first occurrence keeps its position)
			const articleTitles = [...new Set(titleLines)];

			// populate table: the cell ids carry the row index, which is the same index
			// that is later passed along with the title list for processing
			const tbodyCounter = $("#tbodyCounter");
			tbodyCounter.empty();
			articleTitles.forEach(function(articleTitle, i){
				const linkHTML = getLinkHTML(articleTitle);
				const row = `<tr><td>${linkHTML}</td><td id="td_unref_${i}" style="text-align: center;">-</td><td id="td_tags_${i}"></td></tr>`;
				tbodyCounter.append(row);
			});

			recursivelyProcessArticles(articleTitles, 0, timeout);

			// Returns the markup of a link to the article. The anchor is built through the DOM
			// (textContent / href) so the title is escaped in the returned HTML string.
			function getLinkHTML(articleTitle) {
				const link = document.createElement('a');
				link.href = 'https://en.wikipedia.org/wiki/' + encodeURIComponent(articleTitle);
				link.textContent = articleTitle;
				return link.outerHTML;
			}

		});
		// Stop button: raise the stop flag and swap which of Start/Stop is enabled.
		const btStop = $('#btStop');
		btStop.click(() => {
			stopProcessing = true;
			btStop.prop("disabled", true);
			btStart.prop("disabled", false);
		});
		// Copy button: copy the text of the overview table body to the clipboard.
		const btCopy = $('#btCopy');
		btCopy.click(() => {
			writeToClipboard(readTableBodyText());
			mw.notify("The table was copied to the clipboard.");

			// Selects the contents of the table body and returns the selection as text.
			// The selection is left in place afterwards.
			function readTableBodyText(){
				const tableBody = $('#tbodyCounter').get(0);
				const bodyRange = document.createRange();
				bodyRange.selectNodeContents(tableBody);

				const currentSelection = window.getSelection();
				currentSelection.removeAllRanges();
				currentSelection.addRange(bodyRange);
				return currentSelection.toString();
			}

			// Copies `text` by way of a temporary textarea and the 'copy' command.
			function writeToClipboard(text) {
				const scratch = document.createElement('textarea');
				scratch.value = text;
				document.body.appendChild(scratch);
				scratch.select();
				document.execCommand('copy');
				document.body.removeChild(scratch);
			}
		});
		// Close button: stop any running processing, remove the tool UI and show the page content again.
		const btClose = $('#btClose');
		btClose.click(() => {
			btStop.trigger('click');
			const toolContainer = document.getElementById('scriptContainer');
			toolContainer.parentElement.removeChild(toolContainer);
			content.style.display = '';
		});

		// Starts processing of the article at `index` and schedules the following article
		// after `timeout` milliseconds. When the list is exhausted or the stop flag is set,
		// the Stop button label is reset and its click handler is triggered.
		function recursivelyProcessArticles(articleTitles, index, timeout){
			const shouldContinue = !stopProcessing && index < articleTitles.length;
			if(!shouldContinue){
				btStop.text(`Stop`);
				btStop.trigger('click');
				return;
			}

			// show progress on the Stop button
			btStop.text(`Stop (${index}/${articleTitles.length})`);
			processArticle(articleTitles, index);

			setTimeout(() => {
				recursivelyProcessArticles(articleTitles, index + 1, timeout);
			}, timeout);
		}

		// Requests the parsed HTML of the article at `index` from the English Wikipedia
		// API and writes the result into the two table cells belonging to that index:
		// the number of unreferenced paragraphs and the maintenance tag summary.
		// If the request fails, the response contains no article HTML, or the analysis
		// throws, both cells show 'error'.
		//
		// articleTitles: array of article titles
		// index: position of the article in the list; also part of the cell ids
		// returns: a promise that is fulfilled once the cells have been written (it does not reject)
		function processArticle(articleTitles, index){
			const articleTitle = articleTitles[index];
			const cellUnrefId = `td_unref_${index}`;
			const cellTagsId = `td_tags_${index}`;
			const articleSearchTerm = encodeURIComponent(articleTitle);
			const wikiApiUrl = `https://en.wikipedia.org/w/api.php?action=parse&page=${articleSearchTerm}&format=json`;

			// The cells are looked up at write time and filled as plain text: part of the
			// content is derived from the fetched article and must not be interpreted as HTML.
			function showResult(unrefText, tagsText){
				$('#' + cellUnrefId).text(unrefText);
				$('#' + cellTagsId).text(tagsText);
			}

			return fetch(wikiApiUrl).then(async function(response) { // jshint ignore:line
				const data = await response.json();
				const articleHTML = data && data.parse && data.parse.text && data.parse.text['*'];
				if (!articleHTML) {
					// e.g. the API answered with an error object instead of a parse result
					showResult('error', 'error');
					return;
				}

				const parser = new DOMParser();
				const doc = parser.parseFromString(articleHTML, 'text/html');
				const paragraphContainer = $(doc).find('.mw-parser-output').eq(0);

				const paragraphInfo = getParagraphInfo(paragraphContainer);
				const count = `${paragraphInfo.unreferencedParagraphs.length}`;
				const maintenanceTagString = getMaintenanceTagString(paragraphContainer);
				showResult(count, maintenanceTagString);
			}).catch(function(error){
				// network failure, invalid JSON or an exception during the analysis
				console.error(`Could not process "${articleTitle}":`, error);
				showResult('error', 'error');
			});
		}
		
		// Builds a summary of the maintenance tags inside `element`, for example
		// "2x More citations needed, 1x citation needed". Article message boxes
		// ('.ambox') are counted first, then inline templates ('.Inline-Template').
		// Types are listed in the order in which they were first encountered.
		//
		// element: jQuery object wrapping the container to search
		// returns: the summary string, or '' if no tags were found
		function getMaintenanceTagString(element){
			// tag type -> number of occurrences
			const tagCounts = new Map();

			for(const ambox of getAmboxes(element)){
				countTag(getAmboxTyp(ambox));
			}

			for(const inlineTemplate of getInlineTemplates(element)){
				countTag(getInlineTemplateType(inlineTemplate));
			}

			return Array.from(tagCounts, ([type, count]) => `${count}x ${type}`).join(', ');

			function countTag(type){
				tagCounts.set(type, (tagCounts.get(type) || 0) + 1);
			}

			function getInlineTemplates(element){
				return element.find('.Inline-Template').toArray();
			}

			// The type is the element's text without its first and last character
			// (presumably the surrounding square brackets, as in "[citation needed]").
			function getInlineTemplateType(inlineTemplate){
				const innerText = inlineTemplate.innerText;
				return innerText.substring(1, innerText.length - 1);
			}

			function getAmboxes(element){
				return element.find('.ambox').toArray();
			}

			// The type is taken from the first class of the form 'box-Some_name',
			// with underscores turned into spaces. Boxes without such a class fall
			// back to their own text. (The original fallback read a loop variable
			// outside its scope and threw a ReferenceError.)
			function getAmboxTyp(ambox){
				for(const className of ambox.classList){
					if(className.startsWith('box-')){
						return className.substring(4).split('_').join(' ');
					}
				}

				return (ambox.innerText || '').trim();
			}
		}
	}
	
	// Analyses the article shown on the current page and colours every counted paragraph:
	// red ('#faa') if it lacks a reference, green ('#afa') otherwise. The unreferenced
	// paragraphs are logged to the console and their number is shown as a notification.
	function highlightUnreferencedParagraphs(){
		const container = $('#mw-content-text').find('.mw-parser-output').eq(0);
		const { includedParagraphs, unreferencedParagraphs } = getParagraphInfo(container);

		includedParagraphs.forEach((paragraph) => {
			const lacksReference = unreferencedParagraphs.includes(paragraph);
			paragraph.style.background = lacksReference ? '#faa' : '#afa';
		});

		console.log(unreferencedParagraphs);
		mw.notify(`${unreferencedParagraphs.length} unreferenced paragraphs found`);
	}
	
	function getParagraphInfo(paragraphContainer){
		const minimalParagraphLength = 100;
		
		hideRefs(paragraphContainer[0]);
		
		combineMathBlocks(paragraphContainer.children().toArray());
		addElementsFollowingParagraphs(paragraphContainer.children().toArray());
		addElementsPrecedingParagraphs(paragraphContainer.children().toArray());
		
		showRefs(paragraphContainer[0]);
		
		const children = paragraphContainer.children();
		const releventChildren = [];
		for(let child of children){
			if(child.tagName.toLowerCase() === 'p'){
				releventChildren.push(child);
			}
			else if(child.classList.contains('mw-heading2')){
				releventChildren.push(child);
			}
		}
		
		const articleObject = convertToObject(releventChildren);
		removeIrrelevantSections(articleObject);
		const paragraphsInRelevantSections = convertToSimpleArray(articleObject);
		const includedParagraphs = removeShortParagraphs(paragraphsInRelevantSections);
		const unreferencedParagraphs = getUnreferencedParagraphs(includedParagraphs);

		return {
			'includedParagraphs': includedParagraphs,
			'unreferencedParagraphs': unreferencedParagraphs
		};
		
		// Sets display to 'none' on every reference marker and inline template inside `element`.
		function hideRefs(element){
			element.querySelectorAll('.reference, .Inline-Template').forEach(function(marker){
				marker.style.display = 'none';
			});
		}
		
		// Clears the inline display style of every reference marker and inline template inside `element`.
		function showRefs(element){
			element.querySelectorAll('.reference, .Inline-Template').forEach(function(marker){
				marker.style.display = '';
			});
		}
		
		// A block that consists only of a math formula artificially splits one paragraph
		// into parts. For every such block (except at the first and last position) the
		// formula and the element after it are moved into the element before it.
		function combineMathBlocks(elements){
			const lastPosition = elements.length - 2;
			for(let position = 1; position <= lastPosition; position++){
				const candidate = elements[position];
				if(!isMathBlock(candidate)){
					continue;
				}

				const host = elements[position - 1];
				host.appendChild(candidate);
				host.appendChild(elements[position + 1]);
			}

			// True if the first child is a math element whose text is the entire text of `element`.
			function isMathBlock(element){
				const first = element.firstChild;
				if(!first || !first.classList){
					return false;
				}

				return first.classList.contains('mwe-math-element') && element.innerText === first.innerText;
			}
		}
		
		// If the meaning of a passage does not end with the HTML paragraph, the next element
		// is moved into the paragraph. A paragraph counts as unfinished when its text ends
		// with a comma, a colon or a letter. If the next element is a STYLE or LINK element,
		// the element after it is moved as well.
		function addElementsFollowingParagraphs(elements){
			const nonEndingCharacters = [',', ':'];

			for(let i = 0; i < elements.length-1; i++){
				const element = elements[i];
				// only paragraphs are candidates; skip the rest before doing any cloning
				if(element.tagName !== 'P'){
					continue;
				}

				// The text is read from a clone without <style> elements so that CSS text
				// cannot end up as the "last character" of the paragraph.
				const clone = element.cloneNode(true);
				removeStyleElements(clone);
				const innerText = clone.innerText.trim();
				if(innerText.length === 0){
					continue;
				}

				const lastCharacter = innerText[innerText.length-1];
				if(!nonEndingCharacters.includes(lastCharacter) && !isLetter(lastCharacter)){
					continue;
				}

				const nextElement = elements[i+1];
				element.appendChild(nextElement);
				const nextIsStyling = nextElement.tagName === 'STYLE' || nextElement.tagName === 'LINK';
				// NOTE(review): this bound excludes the very last element of the list; it is
				// kept unchanged from the original - confirm whether `< elements.length` was intended.
				if(nextIsStyling && i+2 < elements.length-1){
					element.appendChild(elements[i+2]);
				}
			}

			// True for characters that have distinct lower and upper case forms.
			function isLetter(character){
				return character.toLowerCase() !== character.toUpperCase();
			}

			// getElementsByTagName returns a live collection; it is copied first because
			// removing elements while iterating the live collection skips every second one.
			function removeStyleElements(element){
				const styleElements = Array.from(element.getElementsByTagName('style'));
				for(const styleElement of styleElements){
					styleElement.remove();
				}
			}
		}
		
		// If a paragraph starts in the middle of a sentence (its text begins with a lower case
		// letter), the previous element is moved to the front of that paragraph.
		function addElementsPrecedingParagraphs(elements){
			for(const [previousIndex, element] of elements.slice(1).entries()){
				const text = element.innerText.trim();
				const startsMidSentence = element.tagName === 'P' && text.length > 0 && isLowerCaseLetter(text[0]);
				if(startsMidSentence){
					element.insertBefore(elements[previousIndex], element.firstChild);
				}
			}

			// True for characters that have a case and are in their lower case form.
			function isLowerCaseLetter(character){
				const lower = character.toLowerCase();
				return lower !== character.toUpperCase() && character === lower;
			}
		}

		// Groups the elements by section: returns an object that maps each section name to
		// the elements following that section's level-2 heading. Elements before the first
		// heading go into "Lead". The section name is the heading text without '[edit]'.
		function convertToObject(elementArray){
			const sections = { "Lead": [] }; // jshint ignore:line
			let activeSection = "Lead";

			elementArray.forEach(function(element){
				const isHeading = element.classList.contains('mw-heading2');
				if(!isHeading){
					sections[activeSection].push(element);
					return;
				}

				// a heading starts a new (empty) section
				activeSection = element.innerText.split('[edit]').join('');
				sections[activeSection] = [];
			});

			return sections;
		}

		// Deletes, in place, every section whose name is on the exclusion list.
		function removeIrrelevantSections(articleObject){
			const excludedSections = ['Lead', 'Plot', 'Plots', 'Plot summary', 'Plot synopsis', 'Synopsis', 'Storylines', 'Appearances', 'Further reading', 'See also', 'External links', 'References', 'Bibliography', 'Notes', 'Selected publications', 'Selected works', 'Cited sources', 'Sources', 'Footnotes'];

			Object.keys(articleObject)
				.filter((sectionName) => excludedSections.includes(sectionName))
				.forEach((sectionName) => {
					delete articleObject[sectionName];
				});
		}

		// Concatenates the element lists of all sections into one array, in section order.
		function convertToSimpleArray(articleObject){
			return Object.keys(articleObject).reduce(function(collected, sectionName){
				return collected.concat(articleObject[sectionName]);
			}, []);
		}

		// Returns the paragraphs whose text has at least minimalParagraphLength characters.
		function removeShortParagraphs(paragraphArray){
			return paragraphArray.filter(function(paragraph){
				return paragraph.innerText.length >= minimalParagraphLength;
			});
		}

		// Returns the paragraphs for which isUnreferenced reports true, in their original order.
		function getUnreferencedParagraphs(paragraphArray){
			return paragraphArray.filter((paragraph) => isUnreferenced(paragraph));
		}

		// A paragraph is unreferenced if it contains neither a '.reference' element
		// nor a link whose href starts with '#CITEREF'.
		function isUnreferenced(paragraph){
			const wrapped = $(paragraph);
			if(wrapped.find('.reference').length > 0){
				return false;
			}

			const hasHarvRef = wrapped.find('a').toArray().some(function(link){
				const href = link.getAttribute('href');
				return Boolean(href) && href.startsWith('#CITEREF');
			});

			return !hasHarvRef;
		}
	}
	
})();