# Jump to content
#
# User:PharyngealImplosive7/Date-checker.py
#
# From Wikipedia, the free encyclopedia
import pywikibot
import mwparserfromhell
import re
from pywikibot import pagegenerators
from difflib import unified_diff
import time
from pywikibot.exceptions import EditConflictError, PageSaveRelatedError
from itertools import chain
import os
from datetime import datetime, timedelta

def get_required_format(page):
    """Determine which date format a page's maintenance template requires.

    Args:
        page: The page to inspect. Anything that is not an existing
            ``pywikibot.Page`` is rejected.

    Returns:
        ``'dmy'`` or ``'mdy'`` when exactly one of the ``use dmy dates`` /
        ``use mdy dates`` templates is on the page and at least one of the
        five most recent revisions mentions such a template. ``None`` in
        every other case: invalid or missing page, page younger than 30
        days, no template, both templates present, or a failure while
        reading the revisions.
    """
    # Local import: ``timezone`` is not among the file-level imports.
    from datetime import timezone

    # Basic validation
    if not isinstance(page, pywikibot.Page) or not page.exists():
        return None

    # Skip very new pages (<30 days old).
    created = page.oldest_revision.timestamp
    now = datetime.now(timezone.utc)
    if created.tzinfo is None:
        # NOTE(review): assumes naive pywikibot timestamps are expressed in
        # UTC (not local time) -- confirm against the installed version.
        now = now.replace(tzinfo=None)
    page_age = (now - created).days
    if page_age < 30:
        print(f"Skipping new page: {page.title()} (created {page_age} days ago)")
        return None

    # Collect the date-format templates present in the current text.
    wikicode = mwparserfromhell.parse(page.text)
    date_templates = ('use dmy dates', 'use mdy dates')
    current_templates = set()
    for template in wikicode.filter_templates():
        template_name = template.name.strip().lower()
        if template_name in date_templates:
            current_templates.add(template_name)

    if not current_templates:
        return None

    if len(current_templates) > 1:
        # Conflicting instructions: picking either format would be a guess,
        # so leave the page for a human editor.
        print(f"Skipping {page.title()}: both dmy and mdy templates present")
        return None

    required = 'dmy' if 'use dmy dates' in current_templates else 'mdy'

    # Verify template exists in last 5 revisions.
    try:
        # content=True is required: without it the revision text is not
        # fetched and ``rev.text`` is None, which made every lookup fail.
        for rev in page.revisions(total=5, content=True):
            # Text can still be None (e.g. hidden revisions); treat as empty.
            rev_text = (rev.text or '').lower()
            if 'use dmy dates' in rev_text or 'use mdy dates' in rev_text:
                return required
    except Exception as e:
        print(f"Error checking revisions for {page.title()}: {str(e)}")
        return None

    return None

def handle_edit_conflict(page, fixed_text, violations, retries=3):
    """Try to save ``fixed_text`` to ``page``, retrying after edit conflicts.

    Args:
        page: Object exposing a writable ``text`` attribute plus ``save()``
            and ``title()``.
        fixed_text: The text to store on the page.
        violations: Collection whose length is reported in the edit summary.
        retries: Maximum number of save attempts.

    Returns:
        True as soon as one save call completes without raising; False when
        a non-conflict save error occurs or every attempt hit a conflict.
    """
    for attempt_no in range(1, retries + 1):
        try:
            page.text = fixed_text
            page.save(
                summary=f"PharyngealBOT: Fix {len(violations)} date issues (attempt {attempt_no})",
                bot=False, # Make true when I get approved
                minor=False,
                conflict=attempt_no  # Pywikibot's built-in conflict handling
            )
        except EditConflictError:
            # Back off for longer after each conflict (30s, 60s, 90s, ...).
            print(f"Edit conflict detected on {page.title()}, retrying...")
            time.sleep(30 * attempt_no)
        except PageSaveRelatedError as e:
            # Any other save failure is treated as final.
            print(f"Save error: {str(e)}")
            return False
        else:
            return True
    return False

def in_excluded_context(text, position):
    """Report whether ``position`` lies inside a wikilink, template, or <ref>.

    The text before ``position`` is scanned left to right while keeping a
    stack of constructs that have been opened but not yet closed. The
    position is excluded when that stack is non-empty once the scan reaches
    it, or when the position falls inside the markup of an opening ``<ref``
    tag itself (for example inside its ``name`` attribute).

    Args:
        text: The wikitext to scan.
        position: Index into ``text`` (typically the start of a regex match).

    Returns:
        True if the position is enclosed by ``[[...]]``, ``{{...}}`` or
        ``<ref ...>...</ref>``; False otherwise. Self-closing ``<ref ... />``
        tags enclose nothing. A closing token only closes the innermost open
        construct, so unbalanced markup leaves everything after it excluded.
    """
    open_constructs = []
    # Only text strictly before the position can enclose it.
    limit = min(position, len(text))
    i = 0

    while i < limit:
        if text.startswith('[[', i):
            open_constructs.append('[[')
            i += 2
        elif text.startswith('{{', i):
            open_constructs.append('{{')
            i += 2
        elif text.startswith('<ref', i) and (
            # Slicing (not indexing) so a trailing "<ref" cannot raise
            # IndexError; the check also keeps "<references" from matching.
            text[i + 4:i + 5].isspace() or text[i + 4:i + 5] in ('>', '/')
        ):
            tag_end = text.find('>', i + 4)
            if tag_end == -1:
                # Opening tag is never terminated: treat it as plain text.
                i += 4
            elif tag_end >= limit:
                # The position sits inside the opening tag's own markup.
                return True
            else:
                if text[tag_end - 1] != '/':
                    # Not self-closing, so it encloses what follows.
                    open_constructs.append('<ref')
                i = tag_end + 1
        elif text.startswith(']]', i) and open_constructs and open_constructs[-1] == '[[':
            open_constructs.pop()
            i += 2
        elif text.startswith('}}', i) and open_constructs and open_constructs[-1] == '{{':
            open_constructs.pop()
            i += 2
        elif text.startswith('</ref>', i) and open_constructs and open_constructs[-1] == '<ref':
            open_constructs.pop()
            i += 6
        else:
            i += 1

    return bool(open_constructs)

def find_format_violations(text, required_format):
    """List dates written in the wrong order plus wrong template parameters.

    Args:
        text: Wikitext to scan.
        required_format: ``'dmy'`` or ``'mdy'``; any other value skips the
            date scan and only runs the template-parameter check.

    Returns:
        A list of human-readable violation descriptions: first the date
        matches that are not in an excluded context, then whatever
        ``check_template_params`` reports.
    """
    # Date patterns (matched case-insensitively below)
    mdy_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'
    dmy_pattern = r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'

    # A page that wants one format is searched for dates in the other one.
    if required_format == 'dmy':
        found = [
            f"MDY date found: {hit.group()}"
            for hit in re.finditer(mdy_pattern, text, re.IGNORECASE)
            if not in_excluded_context(text, hit.start())
        ]
    elif required_format == 'mdy':
        found = [
            f"DMY date found: {hit.group()}"
            for hit in re.finditer(dmy_pattern, text, re.IGNORECASE)
            if not in_excluded_context(text, hit.start())
        ]
    else:
        found = []

    # Template parameter problems are appended after the date problems.
    found.extend(check_template_params(text, required_format))
    return found

def fix_violations(text, required_format):
    """Rewrite wrongly ordered dates and ``mf``/``df`` template parameters.

    Only dates spelled with a full English month name are rewritten, and
    only where ``in_excluded_context`` does not report the match as lying
    inside a wikilink, template or reference. This mirrors what
    ``find_format_violations`` reports.

    Args:
        text: Wikitext to correct.
        required_format: ``'dmy'`` or ``'mdy'``.

    Returns:
        The corrected wikitext. For any other ``required_format`` the text
        is returned unchanged, because there is no target format to
        convert to.
    """
    months = (r'January|February|March|April|May|June|July|August|'
              r'September|October|November|December')

    if required_format == 'dmy':
        date_re = re.compile(
            r'\b(' + months + r')\s+(\d{1,2}),?\s+(\d{4})\b', re.IGNORECASE)
        rewritten = r'\2 \1 \3'  # Convert to DMY
        wrong_param, right_param = 'mf', 'df'
    elif required_format == 'mdy':
        date_re = re.compile(
            r'\b(\d{1,2})\s+(' + months + r')\s+(\d{4})\b', re.IGNORECASE)
        rewritten = r'\2 \1, \3'  # Convert to MDY
        wrong_param, right_param = 'df', 'mf'
    else:
        return text

    source = text

    def _convert(match):
        # Match offsets refer to the untouched source text, so the context
        # check must run against that same text.
        if in_excluded_context(source, match.start()):
            return match.group(0)
        return match.expand(rewritten)

    # 1. Fix date formats
    text = date_re.sub(_convert, source)

    # 2. Fix parameters
    wikicode = mwparserfromhell.parse(text)
    for template in wikicode.filter_templates():
        for param in template.params:
            if param.name.strip().lower() != wrong_param:
                continue
            if template.has(right_param):
                # Renaming would create a duplicate argument; leave the
                # template as it is for a human editor.
                continue
            param.name = right_param

    return str(wikicode)

def check_template_params(text, required_format):
    """Report template parameters that contradict the required date format.

    Args:
        text: Wikitext to parse.
        required_format: ``'dmy'`` flags parameters named ``mf``; ``'mdy'``
            flags parameters named ``df``. Any other value flags nothing.

    Returns:
        A list with one description string per offending parameter, in
        document order. Parameter names are compared after stripping
        whitespace and lower-casing.
    """
    found = []

    for tpl in mwparserfromhell.parse(text).filter_templates():
        for prm in tpl.params:
            key = prm.name.strip().lower()
            if key == 'mf' and required_format == 'dmy':
                found.append(f"|mf=y in {tpl.name} (should be df=y)")
            elif key == 'df' and required_format == 'mdy':
                found.append(f"|df=y in {tpl.name} (should be mf=y)")

    return found

def process_page(page):
    """Check one page, show the proposed fix, and save it after confirmation.

    Steps: skip missing pages and redirects, determine the required date
    format, collect violations, build the corrected text, print a diff
    preview, ask the operator on stdin, then queue the save. Every error is
    printed rather than raised, and the function always sleeps 10 seconds
    before returning as a rate limit.

    Args:
        page: The page to process (passed on to ``get_required_format``).

    Returns:
        None.
    """
    try:
        print(f"\nProcessing: [[{page.title()}]]")

        # Skip if page doesn't meet criteria
        if not page.exists() or page.isRedirectPage():
            return

        # Get required format
        required_format = get_required_format(page)
        if not required_format:
            return

        # Find violations
        original_text = page.text
        violations = find_format_violations(original_text, required_format)
        if not violations:
            return

        print(f"Found {len(violations)} violations")
        fixed_text = fix_violations(original_text, required_format)

        # A reported violation does not guarantee a textual change; avoid
        # prompting for, and saving, an edit that alters nothing.
        if fixed_text == original_text:
            print("No changes produced; skipping")
            return

        # Show diff. lineterm='' because the inputs carry no line endings;
        # the default would add a stray newline to each header line.
        diff_lines = list(unified_diff(
            original_text.split('\n'),
            fixed_text.split('\n'),
            fromfile='Original',
            tofile='Corrected',
            n=3,
            lineterm=''
        ))
        print('\n'.join(diff_lines[:15]))
        if len(diff_lines) > 15:
            # Tell the operator the preview is incomplete before they confirm.
            print(f"... ({len(diff_lines) - 15} more diff lines not shown)")

        # Manual confirmation
        if input("Apply changes? (y/n) ").strip().lower() != 'y':
            return

        def _report(saved_page, error):
            # Called once the queued save has actually run; ``error`` is
            # None on success. Reporting here keeps the message truthful
            # for asynchronous saves.
            if error is None:
                print(f"Successfully edited [[{saved_page.title()}]]")
            else:
                print(f"Error: {str(error)}")

        # Save changes
        try:
            page.text = fixed_text
            page.save(
                summary=f"Bot: Fix {len(violations)} date issues",
                botflag=False,  # Set to True after approval
                minor=False,
                asynchronous=True,  # Better for batch edits
                callback=_report
            )
        except Exception as e:
            print(f"Error: {str(e)}")

    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        time.sleep(10)  # Rate limit

def get_category_pages(resume_from=None, limit=50):
    """Yield unprocessed main-namespace pages from the date-format categories.

    Titles already listed in ``bot_progress.txt`` are skipped. A title is
    recorded (in memory and in that file) once the consumer asks for the
    next page after it, so a page is yielded at most once per run even when
    it is reachable through several categories.

    Args:
        resume_from: Optional page title; unprocessed pages are skipped
            until this title is encountered, which is then yielded.
        limit: Maximum number of pages to yield; a falsy value disables
            the limit.

    Yields:
        Page objects produced by ``CategorizedPageGenerator``.

    Raises:
        Exception: Any error raised while setting up or iterating is
            printed and then re-raised.
    """
    try:
        # Initialize
        site = pywikibot.Site("en", "wikipedia")
        print(f"Connected to {site} at {datetime.now()}")

        # State tracking
        state_file = "bot_progress.txt"
        processed = set()
        if os.path.exists(state_file):
            # Explicit UTF-8: titles are frequently non-ASCII and the
            # platform default encoding may not represent them.
            # errors="replace" keeps a state file written under another
            # encoding readable.
            with open(state_file, encoding="utf-8", errors="replace") as f:
                processed = {line.strip() for line in f}
            print(f"Loaded {len(processed)} previously processed pages")

        # Categories to process
        categories = [
            "Category:Use dmy dates",
            "Category:Use mdy dates"
        ]

        count = 0
        for cat_name in categories:
            print(f"\nProcessing {cat_name}...")
            category = pywikibot.Category(site, cat_name)

            if not category.exists():
                print(f"Category {cat_name} doesn't exist!")
                continue

            gen = pywikibot.pagegenerators.CategorizedPageGenerator(
                category,
                recurse=True,
                namespaces=[0],
            )

            for page in gen:
                title = page.title()
                print(f"  Evaluating: {title}")

                if title in processed:
                    continue
                if resume_from and title != resume_from:
                    continue
                # Resume point reached (or none requested): stop skipping.
                resume_from = None

                yield page
                count += 1

                # Remember the title for the rest of this run as well as
                # for later runs.
                processed.add(title)
                with open(state_file, 'a', encoding="utf-8") as f:
                    f.write(f"{title}\n")

                if limit and count >= limit:
                    print(f"Reached limit of {limit} pages")
                    return

    except Exception as e:
        print(f"Error in get_category_pages: {str(e)}")
        raise

# Script entry: walk a small test batch of pages and process each in turn.
print("Starting bot run...")
try:
    # Test with 5 first; will be changed later
    for page_number, page in enumerate(get_category_pages(limit=5), start=1):
        print(f"\nProcessing page {page_number}: {page.title()}")
        process_page(page)
except KeyboardInterrupt:
    # Ctrl-C from the operator ends the run quietly.
    print("\nBot stopped by user")
except Exception as e:
    print(f"Bot crashed: {str(e)}")
finally:
    # Printed however the run ended.
    print("Bot run completed")