# Jump to content
#
# User:PharyngealImplosive7/Date-checker.py
#
# From Wikipedia, the free encyclopedia
# The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
import pywikibot
import mwparserfromhell
import re
from pywikibot import pagegenerators
from difflib import unified_diff
import time
from pywikibot.exceptions import EditConflictError, PageSaveRelatedError
from itertools import chain
import os
from datetime import datetime, timedelta

def get_required_format(page):
    """Return the date format a page declares via its "Use ... dates" template.

    A format is reported only when all of the following hold:

    * ``page`` is an existing ``pywikibot.Page``;
    * its oldest revision is at least 30 days old;
    * its current text contains a ``{{Use dmy dates}}`` or
      ``{{Use mdy dates}}`` template (name compared case-insensitively);
    * at least one of its five most recent revisions mentions one of those
      template names in its text.

    Args:
        page: The page to inspect.

    Returns:
        ``'dmy'`` if a "use dmy dates" template is present in the current
        text, otherwise ``'mdy'``; ``None`` when any check above fails or
        when reading the revision history raises.
    """
    # Basic validation
    if not isinstance(page, pywikibot.Page) or not page.exists():
        return None

    # Skip very new pages (<30 days old)
    # NOTE(review): datetime.now() is naive local time; presumably the
    # revision timestamp is naive UTC, so the age may be off by a few hours
    # -- confirm against the installed Pywikibot version.
    page_age = (datetime.now() - page.oldest_revision.timestamp).days
    if page_age < 30:
        print(f"Skipping new page: {page.title()} (created {page_age} days ago)")
        return None

    # Collect the date-format templates present in the current text
    wikicode = mwparserfromhell.parse(page.text)
    date_templates = ('use dmy dates', 'use mdy dates')
    current_templates = []

    for template in wikicode.filter_templates():
        template_name = template.name.strip().lower()
        if template_name in date_templates:
            current_templates.append(template_name)

    if not current_templates:
        return None

    # Verify template exists in last 5 revisions
    try:
        # content=True is required: without it the revision text is not
        # fetched and every iteration would fail on a missing text.
        for rev in page.revisions(total=5, content=True):
            # A revision may still lack text (e.g. hidden); treat as empty.
            rev_text = (rev.text or '').lower()
            if 'use dmy dates' in rev_text or 'use mdy dates' in rev_text:
                return 'dmy' if 'use dmy dates' in current_templates else 'mdy'
    except Exception as e:
        print(f"Error checking revisions for {page.title()}: {str(e)}")
        return None

    return None

def handle_edit_conflict(page, fixed_text, violations, retries=3):
    """Save *fixed_text* to *page*, retrying when an edit conflict occurs.

    Args:
        page: Page object exposing ``text``, ``save()`` and ``title()``.
        fixed_text: The corrected wikitext to store.
        violations: Sequence of violation descriptions; only its length is
            used, for the edit summary.
        retries: Maximum number of save attempts.

    Returns:
        ``True`` as soon as one save call returns without raising;
        ``False`` if a non-conflict save error occurs or every attempt hits
        an edit conflict.
    """
    for attempt_no in range(1, retries + 1):
        try:
            page.text = fixed_text
            # NOTE(review): the `bot` and `conflict` keywords are forwarded
            # as written by the author; whether the installed Pywikibot
            # accepts them should be confirmed.
            page.save(
                summary=f"PharyngealBOT: Fix {len(violations)} date issues (attempt {attempt_no})",
                bot=False,  # Make true when I get approved
                minor=False,
                conflict=attempt_no,
            )
        except EditConflictError:
            # Back off a little longer after each conflicting attempt.
            print(f"Edit conflict detected on {page.title()}, retrying...")
            time.sleep(30 * attempt_no)
        except PageSaveRelatedError as e:
            # Any other save failure is treated as final.
            print(f"Save error: {str(e)}")
            return False
        else:
            return True

    return False

def in_excluded_context(text, position):
    """Return True if *position* lies inside a wikilink, template, or <ref>.

    The text before ``position`` is scanned left to right while keeping a
    stack of contexts that have been opened (``[[``, ``{{``, ``<ref ...>``)
    and not yet closed. The position is "excluded" when that stack is
    non-empty once the scan reaches it, or when the position falls inside a
    ``<ref ...>`` opening tag itself.

    A closing token only closes the innermost open context, and only when
    it is of the matching kind. An opener that is never closed therefore
    keeps everything after it excluded, which errs on the side of not
    touching the text.

    Args:
        text: The wikitext to scan.
        position: Character index into ``text`` to classify.

    Returns:
        ``True`` if the index is inside one of the excluded constructs,
        ``False`` otherwise (including for out-of-range positions that are
        not preceded by an unclosed construct).
    """
    limit = min(position, len(text))
    open_contexts = []
    i = 0

    while i < limit:
        if text.startswith('[[', i):
            open_contexts.append('[[')
            i += 2
        elif text.startswith('{{', i):
            open_contexts.append('{{')
            i += 2
        elif text.startswith('<ref', i) and (
            text[i + 4:i + 5].isspace() or text[i + 4:i + 5] in ('>', '/')
        ):
            # Slicing (rather than indexing) keeps a trailing "<ref" at the
            # very end of the text from raising IndexError.
            tag_end = text.find('>', i)
            if tag_end == -1 or tag_end >= position:
                # The position sits inside the tag's own attributes.
                return True
            if text[tag_end - 1] != '/':
                # Self-closing <ref ... /> has no body, so it opens nothing.
                open_contexts.append('<ref')
            i = tag_end + 1
        elif text.startswith(']]', i) and open_contexts and open_contexts[-1] == '[[':
            open_contexts.pop()
            i += 2
        elif text.startswith('}}', i) and open_contexts and open_contexts[-1] == '{{':
            open_contexts.pop()
            i += 2
        elif text.startswith('</ref>', i) and open_contexts and open_contexts[-1] == '<ref':
            open_contexts.pop()
            i += 6
        else:
            i += 1

    # Anything still open at this point encloses the position.
    return bool(open_contexts)

def find_format_violations(text, required_format):
    """List date-format violations in *text* for the given required format.

    For ``'dmy'`` every month-day-year date is reported, for ``'mdy'`` every
    day-month-year date; matches for which ``in_excluded_context`` returns
    True are skipped. Template-parameter violations reported by
    ``check_template_params`` are appended afterwards.

    Args:
        text: Wikitext to inspect.
        required_format: ``'dmy'`` or ``'mdy'``; any other value yields only
            the template-parameter check.

    Returns:
        A list of human-readable violation descriptions (possibly empty).
    """
    # Date patterns (matched case-insensitively below)
    mdy_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'
    dmy_pattern = r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'

    # Pick the pattern for the *wrong* style together with its message prefix.
    if required_format == 'dmy':
        offending = (mdy_pattern, "MDY date found: ")
    elif required_format == 'mdy':
        offending = (dmy_pattern, "DMY date found: ")
    else:
        offending = None

    violations = []
    if offending is not None:
        pattern, prefix = offending
        violations = [
            f"{prefix}{hit.group()}"
            for hit in re.finditer(pattern, text, re.IGNORECASE)
            if not in_excluded_context(text, hit.start())
        ]

    # Check template parameters
    violations += check_template_params(text, required_format)
    return violations

def fix_violations(text, required_format):
    """Rewrite dates and date parameters in *text* to the required format.

    Two corrections are applied:

    1. Full dates written in the other style are reordered. Only dates whose
       word is an English month name are touched, and dates for which
       ``in_excluded_context`` returns True are left as they are, matching
       what ``find_format_violations`` reports.
    2. Template parameters named ``mf`` are renamed to ``df`` (for
       ``'dmy'``) or ``df`` to ``mf`` (for ``'mdy'``).

    Args:
        text: Wikitext to correct.
        required_format: ``'dmy'`` or ``'mdy'``. Any other value leaves the
            text unchanged apart from the parse/serialise round trip.

    Returns:
        The corrected wikitext as a string.
    """
    months = (
        r'(?:January|February|March|April|May|June|July|August|'
        r'September|October|November|December)'
    )

    # 1. Fix date formats
    if required_format == 'dmy':
        pattern = rf'\b({months})\s+(\d{{1,2}}),?\s+(\d{{4}})\b'
        replacement = r'\2 \1 \3'  # Convert to DMY
    elif required_format == 'mdy':
        pattern = rf'\b(\d{{1,2}})\s+({months})\s+(\d{{4}})\b'
        replacement = r'\2 \1, \3'  # Convert to MDY
    else:
        pattern = None
        replacement = None

    if pattern is not None:
        source = text  # match offsets refer to the unmodified input

        def _reorder(match):
            # Leave dates inside links, templates and references alone.
            if in_excluded_context(source, match.start()):
                return match.group(0)
            return match.expand(replacement)

        text = re.sub(pattern, _reorder, source, flags=re.IGNORECASE)

    # 2. Fix parameters
    wikicode = mwparserfromhell.parse(text)
    for template in wikicode.filter_templates():
        for param in template.params:
            pname = param.name.strip().lower()
            if required_format == 'dmy' and pname == 'mf':
                param.name = "df"  # Replace mf with df
            elif required_format == 'mdy' and pname == 'df':
                param.name = "mf"  # Replace df with mf

    return str(wikicode)

def check_template_params(text, required_format):
    """Report template parameters that contradict the required date format.

    Every parameter named ``mf`` (when ``'dmy'`` is required) or ``df``
    (when ``'mdy'`` is required) produces one entry. Parameter names are
    compared after stripping whitespace and lower-casing.

    Args:
        text: Wikitext to parse.
        required_format: ``'dmy'`` or ``'mdy'``.

    Returns:
        A list of violation descriptions, in template/parameter order.
    """
    want_dmy = required_format == 'dmy'
    want_mdy = required_format == 'mdy'
    found = []

    for tpl in mwparserfromhell.parse(text).filter_templates():
        for param in tpl.params:
            key = param.name.strip().lower()
            if want_dmy and key == 'mf':
                found.append(f"|mf=y in {tpl.name} (should be df=y)")
            elif want_mdy and key == 'df':
                found.append(f"|df=y in {tpl.name} (should be mf=y)")

    return found

def process_page(page):
    """Check one page for date-format violations and offer to fix them.

    The page is skipped when it does not exist, is a redirect, declares no
    required format, or has no violations. Otherwise a preview diff (first
    15 lines) is printed and the operator is asked to confirm before the
    corrected text is saved.

    All exceptions are caught and printed so that one failing page does not
    stop a batch run. The function always sleeps 10 seconds before
    returning, including when the page was skipped.

    Args:
        page: The page object to process.

    Returns:
        None.
    """
    try:
        print(f"\nProcessing: [[{page.title()}]]")

        # Skip if page doesn't meet criteria
        if not page.exists() or page.isRedirectPage():
            return

        # Get required format
        required_format = get_required_format(page)
        if not required_format:
            return

        # Find violations
        violations = find_format_violations(page.text, required_format)
        if not violations:
            return

        print(f"Found {len(violations)} violations")
        fixed_text = fix_violations(page.text, required_format)

        # Show diff
        diff = unified_diff(
            page.text.split('\n'),
            fixed_text.split('\n'),
            fromfile='Original',
            tofile='Corrected',
            n=3,
            # The inputs carry no line endings, so the control lines must
            # not either; otherwise the joined preview has blank lines.
            lineterm=''
        )
        print('\n'.join(list(diff)[:15]))

        # Manual confirmation
        if input("Apply changes? (y/n) ").strip().lower() != 'y':
            return

        # Save changes
        try:
            page.text = fixed_text
            # NOTE(review): with asynchronous=True the save is presumably
            # queued rather than completed here, so the success message
            # below may precede the actual edit -- confirm.
            page.save(
                summary=f"Bot: Fix {len(violations)} date issues",
                botflag=False,  # Set to True after approval
                minor=False,
                asynchronous=True  # Better for batch edits
            )
            print(f"Successfully edited [[{page.title()}]]")
        except Exception as e:
            print(f"Error: {str(e)}")

    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        time.sleep(10)  # Rate limit

def get_category_pages(resume_from=None, limit=50):
    """Yield main-namespace pages from the date-format tracking categories.

    Pages are drawn, recursively, from "Category:Use dmy dates" and then
    "Category:Use mdy dates". Titles listed in ``bot_progress.txt`` (one per
    line, in the current working directory) are skipped. After the consumer
    has finished with a yielded page and asks for the next one, the title
    is appended to that file and remembered for the rest of the run, so the
    same page is not yielded twice when it is reachable through more than
    one category.

    Args:
        resume_from: Optional page title. When given, unprocessed pages are
            skipped until this title is reached; that page is the first one
            yielded.
        limit: Maximum number of pages to yield; a falsy value disables the
            limit.

    Yields:
        Page objects produced by the category generator.

    Raises:
        Exception: Any error raised during setup or iteration is printed
            and re-raised.
    """
    try:
        # Initialize
        site = pywikibot.Site("en", "wikipedia")
        print(f"Connected to {site} at {datetime.now()}")

        # State tracking
        state_file = "bot_progress.txt"
        processed = set()
        if os.path.exists(state_file):
            with open(state_file) as f:
                processed = set(line.strip() for line in f)
            print(f"Loaded {len(processed)} previously processed pages")

        # Categories to process
        categories = [
            "Category:Use dmy dates",
            "Category:Use mdy dates"
        ]

        count = 0
        for cat_name in categories:
            print(f"\nProcessing {cat_name}...")
            category = pywikibot.Category(site, cat_name)

            if not category.exists():
                print(f"Category {cat_name} doesn't exist!")
                continue

            gen = pywikibot.pagegenerators.CategorizedPageGenerator(
                category,
                recurse=True,
                namespaces=[0],
            )

            for page in gen:
                title = page.title()
                print(f"  Evaluating: {title}")

                if title in processed:
                    continue
                if resume_from and title != resume_from:
                    continue
                # Resume point reached (or none requested): stop skipping.
                resume_from = None

                yield page
                count += 1

                # Record progress both on disk and in memory; without the
                # in-memory update a page listed under several categories
                # would be yielded again later in this same run.
                processed.add(title)
                with open(state_file, 'a') as f:
                    f.write(f"{title}\n")

                if limit and count >= limit:
                    print(f"Reached limit of {limit} pages")
                    return

    except Exception as e:
        print(f"Error in get_category_pages: {str(e)}")
        raise

# Script entry: walk a small batch of category pages and process each one.
print("Starting bot run...")
try:
    # Test with 5 first; will be changed later
    page_no = 0
    for page in get_category_pages(limit=5):
        page_no += 1
        print(f"\nProcessing page {page_no}: {page.title()}")
        process_page(page)
except KeyboardInterrupt:
    # Ctrl-C from the operator ends the run quietly.
    print("\nBot stopped by user")
except Exception as e:
    print(f"Bot crashed: {str(e)}")
finally:
    print("Bot run completed")