# User:ARandomName123/BLP.py
#Unreferenced and untagged BLP search
#Written by ARandomName123 (with assistance from ChatGPT)
#May take a couple hours for the full category
import pywikibot
from pywikibot import pagegenerators
from tqdm import tqdm
import re
site = pywikibot.Site('en', 'wikipedia')

# Build the set of all external-link template names (recursively, so
# sub-categories are included).  Titles are normalised with
# str.capitalize() to match the normalisation applied by
# extract_templates() when scanning page text.
cat = pywikibot.Category(site, 'Category:People and person external link templates')
external_link_templates = {
    t.title(with_ns=False).capitalize() for t in cat.articles(recurse=True)
}
def extract_templates(wikitext):
    """Return the names of all templates transcluded in *wikitext*.

    Each name is the text between ``{{`` and the first ``|``, ``}`` or
    newline, stripped of surrounding whitespace and normalised with
    str.capitalize() so it compares equal to the category-derived set.
    """
    pattern = r'\{\{([^\|\}\n]+)'
    return [name.strip().capitalize() for name in re.findall(pattern, wikitext)]
def check_manual_references(wikitext):
    """Return True if the page appears to list its sources manually.

    "Manually" means a bulleted (``*``) list item appearing somewhere
    after a ``References``, ``External links`` or ``sources`` section
    heading.  Matching is case-insensitive.

    NOTE(review): like the original, the bullet scan does not stop at
    the next section heading — any ``*`` line after the heading counts.
    Fixes an inconsistency in the original, where "External links" was
    matched with an anchored pattern and therefore missed level-3
    (``=== External links ===``) headings, while the other two headings
    were matched anywhere in the line.
    """
    lines = wikitext.splitlines()
    # A heading line contains the section name followed by '=='; using
    # re.search (not re.match) catches headings of any level.
    heading_re = re.compile(r'(?:References|External links|sources)\s*==',
                            re.IGNORECASE)

    def _bulleted_item_follows(start):
        # True if any later line is a bulleted list item.
        return any(l.strip().startswith('*') for l in lines[start:])

    for i, line in enumerate(lines):
        if heading_re.search(line.strip()) and _bulleted_item_follows(i + 1):
            return True
    return False
def has_references(wikitext, external_link_templates):
    """Return True if *wikitext* shows any sign of sourcing.

    Checks, in order: <ref> tags, bare URLs, common citation templates,
    templates from *external_link_templates*, and hand-written bulleted
    reference lists.
    """
    # <ref> tags or bare URLs are immediate evidence of a source.
    if re.search(r'<ref\b', wikitext, re.IGNORECASE):
        return True
    if re.search(r'(https?://|www\.)', wikitext, re.IGNORECASE):
        return True
    # Common citation templates, plus some misc. stat/profile templates
    # (usually redirects) — combined into one alternation.
    citation_pattern = (
        r'\{\{(?:sfn|harvnb|cite|refn|icehockeystats|FIBT profile|'
        r'sports-reference|WTA|ITF|isu name|baseball stats|botanist)\b'
    )
    if re.search(citation_pattern, wikitext, re.IGNORECASE):
        return True
    # Any external-link template drawn from the on-wiki category.
    if any(tpl in external_link_templates for tpl in extract_templates(wikitext)):
        return True
    # Finally, hand-written bulleted reference lists.
    return check_manual_references(wikitext)
# Scan articles in Category:Living people and record every page that
# appears to carry no references at all.
cat = pywikibot.Category(site, 'Category:Living people')
# pages = cat.articles(namespaces=0)  # Check all
pages = cat.articles(namespaces=0, total=10000)  # Check first 10k

numpages = 0       # count of apparently-unsourced BLPs found
last_title = None  # initialised so the final print works on an empty category
with open('unref.txt', 'w', encoding='utf-8') as file:
    for page in tqdm(pagegenerators.PreloadingGenerator(pages)):
        last_title = page.title()
        if page.isRedirectPage():
            continue
        content = page.text
        if has_references(content, external_link_templates):
            continue
        # isDisambig() is checked only for candidate pages, since it may
        # cost an extra API round-trip.
        if page.isDisambig():
            continue
        numpages += 1
        page_title = page.title()
        file.write(f'{page_title}\n')
        print(page_title)
print(f'last article checked: {last_title}')
print(f'{numpages} unsourced BLPs')