Jump to content

User:ARandomName123/BLP.py

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
# Search for unreferenced and untagged BLPs (biographies of living persons)
# Written by ARandomName123 (with assistance from ChatGPT)
# May take a couple of hours to run over the full category

import pywikibot
from pywikibot import pagegenerators
from tqdm import tqdm
import re
site = pywikibot.Site('en', 'wikipedia')

# Collect the name of every page in the external-link template category
# (subcategories included). Names are stored without their namespace prefix
# and passed through str.capitalize() so they can be compared directly with
# the capitalized names produced by extract_templates().
cat = pywikibot.Category(site, 'Category:People and person external link templates')
external_link_templates = {
    member.title(with_ns=False).capitalize()
    for member in cat.articles(recurse=True)
}

def extract_templates(wikitext):
    """Return the normalized names of the templates invoked in *wikitext*.

    A template name is the text between an opening ``{{`` and the first
    ``|``, ``}`` or newline that follows it. Each name is normalized so that
    it can be compared with page titles that went through ``str.capitalize()``:

    * underscores are treated as spaces and runs of whitespace are collapsed
      (``{{IMDb_name}}`` and ``{{IMDb  name}}`` both become ``Imdb name``);
    * an explicit ``Template:`` namespace prefix is dropped;
    * whitespace, including a line break, between ``{{`` and the name is
      ignored;
    * the result is passed through ``str.capitalize()`` (first character
      upper-cased, the rest lower-cased).

    Args:
        wikitext: Raw wikitext of a page.

    Returns:
        A list of normalized template names in order of appearance. Duplicates
        are kept; names that are empty after normalization are omitted.
    """
    prefix = 'template:'
    templates = []
    # \s* lets the name start on the line after the opening braces.
    for raw_name in re.findall(r'\{\{\s*([^\|\}\n]+)', wikitext):
        # Underscores and spaces are interchangeable in page titles.
        name = ' '.join(raw_name.replace('_', ' ').split())
        if name.lower().startswith(prefix):
            name = name[len(prefix):].strip()
        if name:
            templates.append(name.capitalize())
    return templates

# Headings whose section may hold a hand-written (bulleted) source list.
# The "References" and "Sources" patterns are searched anywhere in the line so
# that headings of any level, or with leading words ("Notes and references"),
# are accepted; the "External links" pattern must start the line.
_REFERENCES_HEADING_RE = re.compile(r'\s*References\s*==', re.IGNORECASE)
_EXTERNAL_LINKS_HEADING_RE = re.compile(r'==\s*External links\s*==', re.IGNORECASE)
_SOURCES_HEADING_RE = re.compile(r'\s*sources\s*==', re.IGNORECASE)

# Any wikitext heading line ("== Title =="); group 1 is the leading run of "="
# whose length is used as the heading level.
_ANY_HEADING_RE = re.compile(r'^(=+)[^=].*?=+$')


def _section_has_bullets(lines, start, level):
    """Return True if a ``*`` list item occurs in the section starting at *start*.

    Scanning stops at the first heading whose level is the same as or higher
    than *level* (i.e. it has no more ``=`` signs), so bullets belonging to a
    later, unrelated section are not counted. Subsection headings are skipped
    over. A *level* of 0 means the level is unknown, in which case any heading
    ends the scan.
    """
    for raw_line in lines[start:]:
        text = raw_line.strip()
        if text.startswith('*'):
            return True
        heading = _ANY_HEADING_RE.match(text)
        if heading and (level == 0 or len(heading.group(1)) <= level):
            return False
    return False


def check_manual_references(wikitext):
    """Detect a hand-written bulleted list of sources in *wikitext*.

    A page counts as manually referenced when a "References", "Sources" or
    "External links" section contains at least one line starting with ``*``.
    Only lines inside that section (including its subsections) are considered;
    a bullet list under a following section such as "See also" does not count.

    Args:
        wikitext: Raw wikitext of a page.

    Returns:
        True if such a bulleted list is found, otherwise False.
    """
    lines = wikitext.splitlines()
    for index, line in enumerate(lines):
        heading = line.strip()
        is_source_heading = (
            _REFERENCES_HEADING_RE.search(heading)
            or _EXTERNAL_LINKS_HEADING_RE.match(heading)
            or _SOURCES_HEADING_RE.search(heading)
        )
        if not is_source_heading:
            continue
        # Number of leading "=" characters; 0 if the matched line does not
        # start with "=".
        level = len(heading) - len(heading.lstrip('='))
        if _section_has_bullets(lines, index + 1, level):
            return True
    return False

def has_references(wikitext,external_link_templates):
    """Return True if *wikitext* shows any sign of sourcing.

    The checks run in this order and the first hit wins:

    1. a ``<ref`` tag;
    2. any URL (``http://``, ``https://`` or ``www.``);
    3. one of a fixed list of citation / profile templates;
    4. any template whose normalized name is in *external_link_templates*;
    5. a bulleted list found by ``check_manual_references``.

    Args:
        wikitext: Raw wikitext of a page.
        external_link_templates: Collection of capitalized template names that
            count as external-link templates.

    Returns:
        True if any check matches, otherwise False.
    """
    # Inline markers: <ref> tags, bare URLs, then common citation templates
    # along with some miscellaneous ones (usually redirects).
    marker_patterns = (
        r'<ref\b',
        r'(https?://|www\.)',
        r'\{\{sfn\b',
        r'\{\{harvnb\b',
        r'\{\{cite\b',
        r'\{\{refn\b',
        r'\{\{icehockeystats\b',
        r'\{\{FIBT profile\b',
        r'\{\{sports-reference\b',
        r'\{\{WTA\b',
        r'\{\{ITF\b',
        r'\{\{isu name\b',
        r'\{\{baseball stats\b',
        r'\{\{botanist\b',
    )
    if any(re.search(marker, wikitext, re.IGNORECASE) for marker in marker_patterns):
        return True

    # Any external-link template used on the page counts as a source.
    if any(name in external_link_templates for name in extract_templates(wikitext)):
        return True

    # Last resort: a manually written "*" list under a sources-type heading.
    return bool(check_manual_references(wikitext))

# Maximum number of category members to scan. Set to None to check the whole
# category (may take a couple of hours).
MAX_PAGES = 10000

cat = pywikibot.Category(site, 'Category:Living people')
pages = cat.articles(namespaces=0, total=MAX_PAGES)
numpages = 0
# Initialized up front so the summary below still works when the generator
# yields no pages at all (previously this raised NameError).
last_title = None
with open('unref.txt', 'w', encoding='utf-8') as file:
    for page in tqdm(pagegenerators.PreloadingGenerator(pages), total=MAX_PAGES):
        last_title = page.title()
        if page.isRedirectPage():
            continue
        if has_references(page.text, external_link_templates):
            continue
        # Checked only for pages without references, same order as before.
        if page.isDisambig():
            continue
        numpages += 1
        file.write(f'{last_title}\n')
        print(last_title)
print(f'last article checked: {last_title}')
print(f'{numpages} unsourced BLPs')