Jump to content

User:Tom.Bot/Task3 code

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and use the default browser print function instead.

Source

/// <summary>
/// Inserts <c>{{Taxonbar|from=QID}}</c> ahead of the first DEFAULTSORT/category line of a page,
/// provided the page passes every check below; otherwise sets <paramref name="Skip"/> and
/// records the reason(s) in <paramref name="Summary"/>.
/// </summary>
/// <remarks>
/// Checks, in order: optional size limit; no person/scientist infobox; a taxobox-family
/// infobox is present; no existing Taxonbar (or alias); a <c>wikibase_item</c> of the form
/// <c>Q123</c> is returned by the en.wikipedia pageprops API; at least one property from the
/// "good" list has a string value in the Wikidata <c>wbgetclaims</c> response.
/// When <paramref name="ArticleTitle"/> is exactly "User:Tom.Reding/sandbox" both web requests
/// are bypassed, placeholder values are used, and <paramref name="Skip"/> is forced to false.
/// </remarks>
/// <param name="ArticleText">Wikitext that is searched for templates and, on success, modified.</param>
/// <param name="ArticleTitle">Page title; used to build the pageprops query and for sandbox detection.</param>
/// <param name="wikiNamespace">Not read by this method.</param>
/// <param name="Summary">Receives the success edit summary, or the accumulated skip reason(s).</param>
/// <param name="Skip">Receives true when any check failed (unless a debug switch overrides it).</param>
/// <returns>
/// <paramref name="ArticleText"/>, possibly with a stub tag moved after its adjacent category
/// and the Taxonbar template inserted.
/// </returns>
public string ProcessArticle(string ArticleText, string ArticleTitle, int wikiNamespace, out string Summary, out bool Skip)
{
	// global switches //////////////////////////////////////////////////////////

	bool SaveSkipSummaries = false;
	bool SkipPagesLargerThanLimit = false; // used with int Limit
	bool ManuallyCheckPagesWithoutAGoodInfobox = false; // usually it's an {{infobox person}} or {{infobox scientist}}
	bool ManuallyPlaceTaxonbarAtEndOfPage = false; // aid for pages w/o a {{DEFAULTSORT}} nor cats; manual only
	bool LiveDebug = false;
	bool SandboxDebug = false; // auto-detects
	Skip = false;


	// global-use vars //////////////////////////////////////////////////////////

	int Limit = 2500; // characters on a page; used with bool SkipPagesLargerThanLimit
	Summary = "";


	// preliminary exceptions/error checking ////////////////////////////////////

	if (ArticleTitle == "User:Tom.Reding/sandbox")
	{
		SandboxDebug = true;
	}

	// Same outcome as matching ^[\d\D]{Limit+1}: true when the text has more than Limit chars.
	if (SkipPagesLargerThanLimit && ArticleText.Length > Limit)
	{
		Summary += "Too big (>" + Limit + "B). ";
		Skip = true;
	}

	// check for inappropriate infoboxes
	string PeopleTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+actor[ _]+voice|[Ii]nfobox[ _]+Actor|[Ii]nfobox[ _]+actor|[Ii]nfobox[ _]+Actress|[Ii]nfobox[ _]+actress|[Ii]nfobox[ _]+adult[ _]+biography|[Ii]nfobox[ _]+adult[ _]+female|[Ii]nfobox[ _]+adult[ _]+male|[Ii]nfobox[ _]+Biography|[Ii]nfobox[ _]+biography|[Ii]nfobox[ _]+bio|[Ii]nfobox[ _]+Celebrity|[Ii]nfobox[ _]+director|[Ii]nfobox[ _]+entertainer|[Ii]nfobox[ _]+Fashion[ _]+Designer|[Ii]nfobox[ _]+fashion[ _]+designer|[Ii]nfobox[ _]+film[ _]+actor|[Ii]nfobox[ _]+film[ _]+director|[Ii]nfobox[ _]+human[ _]+being|[Ii]nfobox[ _]+human|[Ii]nfobox[ _]+Indian[ _]+Businessmen|[Ii]nfobox[ _]+Journalist|[Ii]nfobox[ _]+journalist|[Ii]nfobox[ _]+people|[Ii]nfobox[ _]+performer|[Ii]nfobox[ _]+person/measurements|[Ii]nfobox[ _]+person[ _]+ii|[Ii]nfobox[ _]+person|[Ii]nfobox[ _]+Person|[Ii]nfobox[ _]+photographer|[Ii]nfobox[ _]+Real[ _]+Person|[Ii]nfobox[ _]+trade[ _]+unionist|[Ii]nfobox[ _]+victim|[Pp]ersonbox)(?=\s*(?:\||\<\!\-\-))";
	string ScientistTemplates_Regex = @"\{\{\s*(?:[Ii]nfobox[ _]+Academic|[Ii]nfobox[ _]+chemist|[Ii]nfobox[ _]+historian|[Ii]nfobox[ _]+mathematician|[Ii]nfobox[ _]+Professor|[Ii]nfobox[ _]+scientist|[Ii]nfobox[ _]+Scientist)(?=\s*(?:\||\<\!\-\-))";
	bool BadInfobox1 = Regex.IsMatch(ArticleText, PeopleTemplates_Regex, RegexOptions.IgnoreCase);
	bool BadInfobox2 = Regex.IsMatch(ArticleText, ScientistTemplates_Regex, RegexOptions.IgnoreCase);
	if (BadInfobox1 || BadInfobox2)
	{
		Summary += @"Person/scientist infobox found. ";
		Skip = true;
	}

	// check for appropriate infoboxes
	string TitleTemplates_Regex = @"\{\{\s*(?:DISPLAY ?TITLE|[Ii]talicisedtitle|[Ii]talicised[ _]+title|[Ii]talicizedtitle|[Ii]talicized[ _]+title|[Ii]talicizetitle|[Ii]talicize[ _]+title|[Ii]talicstitle|[Ii]talics[ _]+title|[Ii]talics|ITALICTITLE|[Ii]talictitle|[Ii]talic[ _]+title[ _]+infobox|[Ii]talic[ _]+title|[Ii]talic|[Ii]tal|[Rr]edirect[ _]+italic[ _]+title|[Tt]itle[ _]+italic)";

	string TaxoTemplates_Regex = @"\{\{\s*(?:Template:\s*|Wikipedia:\s*)?(?:Infobox[ _]+)?(" + // prefixes
										@"Taxobox|Taxo|TX|Species ?box|Subspeciesbox|Infraspeciesbox|Subspeciesbox/ICN|" + // taxo/species
										@"Automatic[ _]+t?axobox|" + // auto
										@"bacteria|microorganism|virus" + // other
										@")(?=\s*(?:\||\<\!\-\-|" + TitleTemplates_Regex + @"|(?<=Automatic[ _]+t?axobox\s*)\}\}))"; // suffixes
	bool NoTaxoTemplates = !Regex.IsMatch(ArticleText, TaxoTemplates_Regex, RegexOptions.IgnoreCase);

	// With ManuallyCheckPagesWithoutAGoodInfobox on, a page lacking a taxobox is left for manual
	// review: nothing is added here, and Skip is already true if a person/scientist infobox was found.
	if (NoTaxoTemplates && !ManuallyCheckPagesWithoutAGoodInfobox)
	{
		Summary += @"No auto/taxo/speciesbox found. ";
		Skip = true;
	}

	// check for {{Taxonbar
	string TaxonbarAliases_Regex = @"\{\{\s*(?:[Tt]axobar|[Tt]axon\-bar|[Tt]axonbar|[Tt]axonBar|[Tt]axonIds|[Tt]axon[ _]+bar)"; // 0 grps
	bool HasTaxonbar = Regex.IsMatch(ArticleText, TaxonbarAliases_Regex, RegexOptions.IgnoreCase);
	if (HasTaxonbar)
	{
		Summary += @"Taxonbar exists. ";
		Skip = true;
	}

	// get wikibase_item via WP API
	// ex: https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=Panthera%20leo&redirects=0&formatversion=2&ppprop=wikibase_item
	// Percent-encode the title as UTF-8. The earlier hand-written Replace chain emitted "%96" for
	// an en dash (a Windows-1252 byte, not valid UTF-8) and left "%" and other reserved characters
	// unescaped, so such titles produced a malformed query.
	string ArticleTitle_URL = System.Uri.EscapeDataString(ArticleTitle);
	string URL1 = @"https://en.wikipedia.org//w/api.php?action=query&format=json&prop=pageprops&titles=" +
						ArticleTitle_URL + @"&redirects=0&formatversion=2&ppprop=wikibase_item";
	string HTML1 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML1 = Tools.GetHTML(URL1);
		}
		catch
		{
			// best-effort: any fetch failure becomes a skip reason rather than an exception
			Summary = "GetHTML1 failed. ArticleTitle_URL = " + ArticleTitle_URL + " . ";
			if (!LiveDebug)
			{
				Skip = true;
			}
		}
	}


	// html1 error checks ///////////////////////////////////////////////////////

	// Groups[1] is "" when the response has no wikibase_item (or HTML1 was never fetched).
	string QID = Regex.Match(HTML1, @"wikibase_item"":""([^""]+)").Groups[1].Value;
	if (string.IsNullOrEmpty(QID) && !Skip && !SandboxDebug)
	{
		Summary = @"QID retrieval failed. ";
		Skip = true;
	}

	if (!Regex.IsMatch(QID, @"^Q\d+$") && !Skip && !SandboxDebug) // case sensitive, jtbs
	{
		Summary = @"Unexpected QID format. ";
		Skip = true;
	}


	// determine quantity & quality of WD properties used ///////////////////////

	List<string> GoodPropertyList = new List<string>(new string[] {
		// alphabetically from [[Template:Taxonbar#Taxon identifiers]]:
		"P4024",
		"P2036",
		"P1348",
		"P3594",
		"P2833",
		"P2026",
		"P2946",
		"P3398",
		"P838",
		"P687",
		"P2464",
		"P3060",
		"P1940",
		"P3444",
//		"P830",	// ignore: EOL, Encyclopedia of Life
		"P1895",
		"P938",
		"P3101",
		"P1727",
		"P3100",
		"P1747",
		"P842",
//		"P846",	// ignore: GBIF, Global Biodiversity Information Facility
		"P1832",
		"P1421",
		"P3099",
		"P1076",
		"P3151",
		"P1391",
		"P961",
		"P586",
		"P815",
		"P627",
		"P3064",
		"P1991",
		"P959",
		"P962",
		"P685",
		"P4122",
		"P2434",
		"P3102",
//		"P1070",	// ignore: TPL, The Plant List
		"P1772",
		"P1992",
		"P2040",
		"P2455",
		"P960",
		"P1745",
		"P1761",
		"P3591",
		"P850",
		"P3288",
		"P2426",
		"P1746"
	}); // ignores don't count towards the total property count, per [[WT:TREE#Taxonbar addition requirements]]

	List<string> BadPropertyList = new List<string>(new string[] {
		"P830",	// ignore: EOL, Encyclopedia of Life
		"P846",	// ignore: GBIF, Global Biodiversity Information Facility
		"P1070",	// ignore: TPL, The Plant List

		// remaining 13 uniques from [[d:Wikidata:WikiProject Taxonomy#Databases]]:
		// [[Module:Taxonbar/conf]] needs updating (follow up after bulk run)
		"P1939",
		"P2752",
		"P2794",
		"P3088",
		"P3186",
		"P3322",
		"P3420",
		"P3606",
		"P4125",
		"P4194",
		"P4301",
		"P4311",
		"P4526"
	});

	// get Wikidata
	// ex: https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=Q36557
	string URL2 = @"https://www.wikidata.org//w/api.php?action=wbgetclaims&format=json&entity=" + QID;
	string HTML2 = "";
	if (!Skip && !SandboxDebug)
	{
		try
		{
			HTML2 = Tools.GetHTML(URL2);
		}
		catch
		{
			// best-effort: any fetch failure becomes a skip reason rather than an exception
			Summary = "GetHTML2 failed. URL2 = " + URL2 + " . ";
			if (!LiveDebug)
			{
				Skip = true;
			}
		}
	}

	// scrape Wikidata
	// example text surrounding a populated property:
	//        "P959": [
	//            {
	//                "mainsnak": {
	//                    "snaktype": "value",
	//                    "property": "P959",
	//                    "hash": "c18d910a13321717e90ba037d26f1f1b86558128",
	//                    "datavalue": {
	//                        "value": "11500009",
	//                        "type": "string"
	//                    },
	//                    "datatype": "external-id"
	//                },
	int iGoodProps = 0;
	int iBadProps = 0;
	if (!Skip && !SandboxDebug)
	{
		// A property counts only when its mainsnak carries a non-empty string datavalue.
		// The pattern is the same for both lists; only the property ID in the middle varies.
		string PropertyPrefix_Regex = @"""property"":\s*""";
		string PropertySuffix_Regex = @""",[^\{\}]*""datavalue"":\s*\{\s*""value"":\s*""[^""]+""";

		foreach (string p in GoodPropertyList)
		{
			if (Regex.IsMatch(HTML2, PropertyPrefix_Regex + p + PropertySuffix_Regex))
			{
				iGoodProps++;
			}
		}

		foreach (string p in BadPropertyList)
		{
			if (Regex.IsMatch(HTML2, PropertyPrefix_Regex + p + PropertySuffix_Regex))
			{
				iBadProps++;
			}
		}

		if (iGoodProps == 0)
		{
			if (iBadProps > 0)
			{
				Summary += "No good PIDs found. ";
			}
			else
			{
				Summary += "No PIDs found. ";
			}
			Skip = true;
		}
	}


	// main /////////////////////////////////////////////////////////////////////

	if (!Skip)
	{
		if (SandboxDebug)
		{
			// placeholders, since neither API was queried for the sandbox page
			iGoodProps = 1;
			QID = "1";
		}

		// move {{-stub}} tag closer to end of page, otherwise GenFixes adds an extra line before {{Taxonbar}} that can't be fixed w/o a reparse ([[Smythea]])
		// leading "\s*" & "\n" for cases like "{{reflist}}{{Malvales-stub}}" ([[Herrania mariae]])
		string MoveStubAfterCat_Regex = @"\s*(\{\{[^\{\}]*[ -]stub\s*\}\})\s*(\[\[\s*Category[^\[\]]+\]\])";
		ArticleText = Regex.Replace(ArticleText, MoveStubAfterCat_Regex, "\n" + @"$2" + "\n" + @"$1", RegexOptions.IgnoreCase);

		string Plural = (iGoodProps > 1) ? "s" : "";
		string TaxonbarComplete = @"{{Taxonbar|from=" + QID + @"}}";
		// Group 1 is everything from the start of the page up to (not including) the line break(s)
		// that precede the first DEFAULTSORT or category line.
		string AddBeforeCats_Regex = @"(^[\d\D]+?)(?=[\r\n]+[ 	]*(?:\{\{\s*Default ?sort|\[\[\s*Category))"; // better results than adding after last cat ([[Hellolycaena]])
		string SuccessSummary = @"+{{[[Template:Taxonbar|Taxonbar]]|" +
										@"[[:Category:Taxonbar templates without from parameter|from]]=" +
										@"[[d:Special:EntityPage/" + QID + @"|" + QID + @"]]}} " +
										@"([[WT:TREE#Taxonbar addition requirements|" + iGoodProps + @" sig. taxon ID" + Plural + @"]]); " +
										@"[[WP:GenFixes]] on,";
		bool NoCat = !Regex.IsMatch(ArticleText, AddBeforeCats_Regex, RegexOptions.IgnoreCase);
		if (NoCat)
		{
			if (ManuallyPlaceTaxonbarAtEndOfPage)
			{
				ArticleText += "\n" + TaxonbarComplete;
				Summary = SuccessSummary + " (uncategorized page) ";
			}
			else
			{
				Summary += @"No cats/defaultsort to anchor {{Taxonbar}} around. Batch manually/code later. ";
				Skip = true;
			}
		}
		else
		{
			ArticleText = Regex.Replace(ArticleText, AddBeforeCats_Regex, @"$1" + "\n" + TaxonbarComplete, RegexOptions.IgnoreCase);
			Summary = SuccessSummary;
		}
	}


	// exception tracking ///////////////////////////////////////////////////////

	if (Skip && SaveSkipSummaries && !SandboxDebug)
	{
		// one tab-separated line per skipped page, appended to a local log file
		string Message = ArticleTitle + "\t" + Summary + "\n";
		string FileName = @"Module output - Add {{Taxonbar+from}} (skip summaries).txt";
		string Folder = @"F:\"; // desktop
		string FullPath = Folder + FileName;
		const bool APPEND = true;
		Tools.WriteTextFileAbsolutePath(Message, FullPath, APPEND);
	}

	// debug modes always let the page through so the result can be inspected
	if (LiveDebug || SandboxDebug)
	{
		Skip = false;
	}

	return ArticleText;
}