# Jump to content
#
# User:PharyngealImplosive7/IPAlinker.py
#
# From Wikipedia, the free encyclopedia
# This is an old revision of this page, as edited by PharyngealImplosive7 (talk | contribs) at 23:18, 21 April 2025 (create). The present address (URL) is a permanent link to this revision, which may differ significantly from the current revision.
# (diff) ← Previous revision | Latest revision (diff) | Newer revision → (diff)
import pywikibot
from mwparserfromhell import parse, nodes
import os
import random

# Configuration
# Account name handed to pywikibot.Site() as ``user`` in main().
USERNAME = "PharyngealImplosive7"
# Category that main() scans for candidate pages.
CATEGORY_TITLE = "Category:Languages"
# Text file with one page title per line; read by load_processed_pages()
# and appended to by save_processed_page().
PROCESSED_FILE = "processed_pages.txt"
# Maximum number of pages handled per run; main() gathers up to twice this
# many candidates before shuffling and picking.
PAGE_LIMIT = 5
# Templates whose first-parameter text (after bracket stripping) is longer
# than this many characters are left untouched.
MAX_IPA_LENGTH = 5
API_CHUNK_SIZE = 50  # Passed as ``total`` to Category.articles() in fast_category_loader()

def load_processed_pages(path=None):
    """Return the set of page titles already recorded as processed.

    Args:
        path: File to read, one title per line. Defaults to PROCESSED_FILE.

    Returns:
        A set of title strings; an empty set when the file does not exist.
    """
    if path is None:
        path = PROCESSED_FILE
    try:
        # ``with`` guarantees the handle is closed. The encoding is left at
        # the platform default on purpose so it matches what
        # save_processed_page() writes.
        with open(path) as handle:
            return set(handle.read().splitlines())
    except FileNotFoundError:
        # First run: nothing has been processed yet.
        return set()

def save_processed_page(page_title):
    """Append *page_title* as one new line at the end of PROCESSED_FILE."""
    with open(PROCESSED_FILE, mode='a') as log_file:
        entry = f"{page_title}\n"
        log_file.write(entry)

def fast_category_loader(site, category_name):
    """Return an iterable of articles found in *category_name* on *site*.

    Only namespace 0 is requested, subcategories are followed
    (``recurse=True``), and ``total`` is set to API_CHUNK_SIZE so the
    iteration yields no more than that many pages.
    """
    category = pywikibot.Category(site, category_name)
    query_options = {
        "namespaces": 0,
        "recurse": True,
        "total": API_CHUNK_SIZE,
    }
    return category.articles(**query_options)

def _is_inside_table(wikicode, node):
    """Return True when *node* has a ``table`` Tag among its ancestors."""
    try:
        ancestors = wikicode.get_ancestors(node)
    except ValueError:
        # The node is no longer part of the tree (for example because an
        # enclosing parameter value was re-parsed by an earlier edit).
        return False
    return any(
        isinstance(ancestor, nodes.Tag)
        and str(ancestor.tag).strip().lower() == 'table'
        for ancestor in ancestors
    )


def process_ipa_templates(wikicode):
    """Turn short ``{{IPA}}`` templates located in tables into ``{{IPA link}}``.

    A template is converted when it is named "ipa" (case-insensitive), sits
    somewhere inside a table, has at least one parameter, and the text of its
    first parameter, once a single enclosing bracket pair has been removed,
    is non-empty and at most MAX_IPA_LENGTH characters long. Removed brackets
    are re-inserted as plain text directly around the template.

    Args:
        wikicode: Parsed page (mwparserfromhell ``Wikicode``); edited in place.

    Returns:
        The number of templates that were converted.
    """
    brackets = {'(': ')', '[': ']', '{': '}'}
    changes = 0

    # Work on a list snapshot: text nodes are inserted into the tree below,
    # and mutating it while a lazy ifilter generator is walking it is unsafe.
    for template in wikicode.filter_templates():
        if template.name.strip().lower() != 'ipa':
            continue

        # Table check. Nodes carry no parent pointer, so ancestry has to be
        # looked up through the containing Wikicode object.
        if not _is_inside_table(wikicode, template):
            continue

        # Parameter check
        if not template.params:
            continue

        # Content processing
        raw_content = str(template.params[0].value).strip()
        content = raw_content

        # Remove existing brackets
        had_brackets = (
            len(raw_content) >= 2
            and raw_content[0] in brackets
            and raw_content[-1] == brackets[raw_content[0]]
        )
        if had_brackets:
            content = raw_content[1:-1].strip()

        # Length check; an empty symbol cannot be linked to anything.
        if not content or len(content) > MAX_IPA_LENGTH:
            continue

        # Replacement logic
        template.name = "IPA link"
        if had_brackets:
            template.params[0].value = content
            # Put the brackets back as text nodes on either side of the
            # template, in whichever nested Wikicode actually holds it.
            wikicode.insert_before(template, nodes.Text(raw_content[0]))
            wikicode.insert_after(template, nodes.Text(raw_content[-1]))
        changes += 1

    return changes

def process_page(page):
    """Convert IPA templates on one page and save it after confirmation.

    The page text is parsed, passed to process_ipa_templates(), and, when
    that reports changes, a diff is shown and the operator is asked whether
    to save.

    Args:
        page: The pywikibot page to process.

    Returns:
        True when the page was saved; False when the page is missing, is a
        redirect, needed no changes, or the operator declined.
    """
    print(f"\nProcessing: {page.title()}")
    try:
        text = page.get()
    except (pywikibot.exceptions.NoPageError,
            pywikibot.exceptions.IsRedirectPageError):
        # get() raises for missing pages and, by default, for redirects;
        # neither has article text to convert, so skip instead of aborting.
        return False

    wikicode = parse(text)
    changes = process_ipa_templates(wikicode)

    if not changes:
        print("No changes needed")
        return False

    new_text = str(wikicode)
    print(f"🔧 Found {changes} IPA conversions")
    pywikibot.showDiff(text, new_text)

    # Tolerate stray whitespace around the operator's answer.
    if input("Save changes? (y/n): ").strip().lower() != 'y':
        return False

    page.text = new_text
    page.save(summary=f"BOT: IPA conversion ({changes} templates)", botflag=True)
    return True

def main():
    """Run one interactive batch of IPA template conversions.

    Collects up to ``PAGE_LIMIT * 2`` not-yet-processed pages from
    CATEGORY_TITLE, shuffles them, and runs process_page() on at most
    PAGE_LIMIT of them. Every visited page is recorded with
    save_processed_page(), whether or not it was saved.
    """
    site = pywikibot.Site('en', 'wikipedia', user=USERNAME)
    processed = load_processed_pages()

    print("🔍 Scanning category pages...")
    candidate_pages = []
    for page in fast_category_loader(site, CATEGORY_TITLE):
        if page.title() in processed:
            continue
        candidate_pages.append(page)
        # Stop pulling from the category iterator once there are enough
        # candidates, instead of draining it and slicing afterwards.
        if len(candidate_pages) >= PAGE_LIMIT * 2:
            break

    random.shuffle(candidate_pages)
    selected_pages = candidate_pages[:PAGE_LIMIT]
    processed_count = 0

    # Number the progress line by position in this batch; processed_count
    # only advances when a page is actually saved.
    for position, page in enumerate(selected_pages, start=1):
        print(f"\n=== Processing page {position}/{len(selected_pages)} ===")
        if process_page(page):
            processed_count += 1
        save_processed_page(page.title())

    print(f"\nCompleted processing {processed_count} pages")


if __name__ == "__main__":
    main()