# User:ARandomName123/BLP.py
#Unreferenced and untagged BLP search
#Written by ARandomName123 (with assistance from ChatGPT)
#May take a couple hours for the full category
import pywikibot
from pywikibot import pagegenerators
from tqdm import tqdm
import re
site = pywikibot.Site('en', 'wikipedia')

# Build the set of all external-link template names (recursively, so
# sub-categories are included).  Titles are normalised with
# str.capitalize() to match the normalisation applied by
# extract_templates() when scanning page text.
cat = pywikibot.Category(site, 'Category:People and person external link templates')
external_link_templates = {
    t.title(with_ns=False).capitalize() for t in cat.articles(recurse=True)
}
def extract_templates(wikitext):
    """Return the names of all templates transcluded in *wikitext*.

    Each name is the text between ``{{`` and the first ``|``, ``}`` or
    newline, stripped of surrounding whitespace and normalised with
    str.capitalize() so it compares equal to the category-derived set.
    """
    pattern = r'\{\{([^\|\}\n]+)'
    return [name.strip().capitalize() for name in re.findall(pattern, wikitext)]
def check_manual_references(wikitext):
    """Return True if the page appears to list its sources manually.

    "Manually" means a bulleted (``*``) list item appearing somewhere
    after a ``References``, ``External links`` or ``sources`` section
    heading.  Matching is case-insensitive.

    NOTE(review): like the original, the bullet scan does not stop at
    the next section heading — any ``*`` line after the heading counts.
    Fixes an inconsistency in the original, where "External links" was
    matched with an anchored pattern and therefore missed level-3
    (``=== External links ===``) headings, while the other two headings
    were matched anywhere in the line.
    """
    lines = wikitext.splitlines()
    # A heading line contains the section name followed by '=='; using
    # re.search (not re.match) catches headings of any level.
    heading_re = re.compile(r'(?:References|External links|sources)\s*==',
                            re.IGNORECASE)

    def _bulleted_item_follows(start):
        # True if any later line is a bulleted list item.
        return any(l.strip().startswith('*') for l in lines[start:])

    for i, line in enumerate(lines):
        if heading_re.search(line.strip()) and _bulleted_item_follows(i + 1):
            return True
    return False
def has_references(wikitext, external_link_templates):
    """Return True if *wikitext* shows any sign of sourcing.

    Checks, in order: <ref> tags, bare URLs, common citation templates,
    templates from *external_link_templates*, and hand-written bulleted
    reference lists.
    """
    # <ref> tags or bare URLs are immediate evidence of a source.
    if re.search(r'<ref\b', wikitext, re.IGNORECASE):
        return True
    if re.search(r'(https?://|www\.)', wikitext, re.IGNORECASE):
        return True
    # Common citation templates, plus some misc. stat/profile templates
    # (usually redirects) — combined into one alternation.
    citation_pattern = (
        r'\{\{(?:sfn|harvnb|cite|refn|icehockeystats|FIBT profile|'
        r'sports-reference|WTA|ITF|isu name|baseball stats|botanist)\b'
    )
    if re.search(citation_pattern, wikitext, re.IGNORECASE):
        return True
    # Any external-link template drawn from the on-wiki category.
    if any(tpl in external_link_templates for tpl in extract_templates(wikitext)):
        return True
    # Finally, hand-written bulleted reference lists.
    return check_manual_references(wikitext)
# Scan articles in Category:Living people and record every page that
# appears to carry no references at all.
cat = pywikibot.Category(site, 'Category:Living people')
# pages = cat.articles(namespaces=0)  # Check all
pages = cat.articles(namespaces=0, total=10000)  # Check first 10k

numpages = 0       # count of apparently-unsourced BLPs found
last_title = None  # initialised so the final print works on an empty category
with open('unref.txt', 'w', encoding='utf-8') as file:
    for page in tqdm(pagegenerators.PreloadingGenerator(pages)):
        last_title = page.title()
        if page.isRedirectPage():
            continue
        content = page.text
        if has_references(content, external_link_templates):
            continue
        # isDisambig() is checked only for candidate pages, since it may
        # cost an extra API round-trip.
        if page.isDisambig():
            continue
        numpages += 1
        page_title = page.title()
        file.write(f'{page_title}\n')
        print(page_title)
print(f'last article checked: {last_title}')
print(f'{numpages} unsourced BLPs')