# User:PharyngealImplosive7/Date-checker.py
import pywikibot
import mwparserfromhell
import re
from pywikibot import pagegenerators
from difflib import unified_diff
import time
from pywikibot.exceptions import EditConflictError, PageSaveRelatedError
from itertools import chain
import os
from datetime import datetime, timedelta
def get_required_format(page):
    """Return the date format a page declares through its "use ... dates" template.

    Args:
        page: The ``pywikibot.Page`` to inspect.

    Returns:
        ``'dmy'`` or ``'mdy'`` when the current wikitext contains a
        ``use dmy dates`` / ``use mdy dates`` template and at least one of
        the five most recent revisions mentions either template name.
        ``None`` when ``page`` is not an existing ``pywikibot.Page``, is
        less than 30 days old, carries no such template, or its revisions
        cannot be read.  When both templates are present, ``'dmy'`` wins.
    """
    # Basic validation
    if not isinstance(page, pywikibot.Page) or not page.exists():
        return None

    # Skip very new pages (<30 days old)
    # NOTE(review): datetime.now() is local time; if the revision timestamp
    # is UTC the skew is a few hours at most, irrelevant for a 30-day cutoff.
    page_age = (datetime.now() - page.oldest_revision.timestamp).days
    if page_age < 30:
        print(f"Skipping new page: {page.title()} (created {page_age} days ago)")
        return None

    # Collect the date-format templates present in the current text
    wikicode = mwparserfromhell.parse(page.text)
    date_templates = ['use dmy dates', 'use mdy dates']
    current_templates = [
        name
        for name in (t.name.strip().lower() for t in wikicode.filter_templates())
        if name in date_templates
    ]
    if not current_templates:
        return None

    # Verify template exists in last 5 revisions
    try:
        # content=True is required: without it the revision text is not
        # fetched, rev.text is None, and the check below could never succeed.
        for rev in page.revisions(total=5, content=True):
            # Text can still be None (e.g. hidden revisions); treat as empty.
            rev_text = (rev.text or '').lower()
            if any(name in rev_text for name in date_templates):
                return 'dmy' if 'use dmy dates' in current_templates else 'mdy'
    except Exception as e:
        print(f"Error checking revisions for {page.title()}: {str(e)}")
        return None
    return None
def handle_edit_conflict(page, fixed_text, violations, retries=3):
    """Save ``fixed_text`` to ``page``, retrying when an edit conflict occurs.

    Args:
        page: Page object exposing ``text``, ``save()`` and ``title()``.
        fixed_text: Wikitext to store on the page.
        violations: Collection of found issues; only its length is used,
            in the edit summary.
        retries: Maximum number of save attempts.

    Returns:
        ``True`` as soon as one save call returns without raising;
        ``False`` after a ``PageSaveRelatedError`` or once all attempts
        have ended in an ``EditConflictError``.
    """
    for attempt_no in range(1, retries + 1):
        try:
            page.text = fixed_text
            summary = f"PharyngealBOT: Fix {len(violations)} date issues (attempt {attempt_no})"
            # NOTE(review): `bot` and `conflict` are forwarded to save()
            # exactly as before; confirm both keywords are supported by the
            # installed Pywikibot version.
            page.save(
                summary=summary,
                bot=False,  # Make true when I get approved
                minor=False,
                conflict=attempt_no,
            )
        except EditConflictError:
            print(f"Edit conflict detected on {page.title()}, retrying...")
            # Back off longer after each failed attempt: 30s, 60s, 90s, ...
            time.sleep(30 * attempt_no)
        except PageSaveRelatedError as e:
            print(f"Save error: {str(e)}")
            return False
        else:
            return True
    return False
def in_excluded_context(text, position):
    """Report whether ``position`` lies inside a wikilink, template, or reference.

    The text before ``position`` is scanned left to right while a stack of
    currently open constructs (``[[``, ``{{``, ``<ref ...>``) is maintained.
    A closing token only pops the stack when it matches the innermost open
    construct.  The position is excluded when anything is still open once
    the scan reaches it.

    Args:
        text: The wikitext to scan.
        position: Character index into ``text``.

    Returns:
        ``True`` if ``position`` is inside an unclosed ``[[...]]``,
        ``{{...}}`` or ``<ref>...</ref>`` span, or inside a self-closing
        ``<ref ... />`` tag; ``False`` otherwise, including for negative
        positions.
    """
    closers = {'[[': ']]', '{{': '}}', '<ref': '</ref>'}
    open_stack = []
    length = len(text)
    # Only text strictly before `position` decides whether it is enclosed.
    limit = min(position, length)
    i = 0
    while i < limit:
        if text.startswith('[[', i):
            open_stack.append('[[')
            i += 2
        elif text.startswith('{{', i):
            open_stack.append('{{')
            i += 2
        elif (text.startswith('<ref', i) and i + 4 < length
              and (text[i + 4].isspace() or text[i + 4] in '>/')):
            # The bounds check avoids an IndexError when "<ref" ends the
            # text; the delimiter check keeps "<references" from matching.
            tag_end = text.find('>', i + 4)
            if tag_end != -1 and text[tag_end - 1] == '/':
                # Self-closing <ref ... /> has no body and no </ref>, so it
                # must not stay on the stack.
                if position <= tag_end:
                    return True
                i = tag_end + 1
            else:
                open_stack.append('<ref')
                i += 4
        elif open_stack and text.startswith(closers[open_stack[-1]], i):
            # Closing token for the innermost open construct.
            i += len(closers[open_stack.pop()])
        else:
            i += 1
    return bool(open_stack)
def find_format_violations(text, required_format):
    """Collect descriptions of dates and template parameters in the wrong format.

    Args:
        text: Wikitext to inspect.
        required_format: ``'dmy'`` or ``'mdy'``; any other value skips the
            date scan and only runs the parameter check.

    Returns:
        A list of human-readable violation strings: first the wrongly
        formatted dates found outside excluded contexts (as decided by
        ``in_excluded_context``), then whatever ``check_template_params``
        reports.
    """
    # Date patterns (matched case-insensitively below)
    mdy_pattern = r'\b(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{1,2},?\s+\d{4}\b'
    dmy_pattern = r'\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|September|October|November|December)\s+\d{4}\b'

    # Choose the pattern for the format that must NOT appear on the page.
    if required_format == 'dmy':
        wrong_pattern, prefix = mdy_pattern, "MDY date found: "
    elif required_format == 'mdy':
        wrong_pattern, prefix = dmy_pattern, "DMY date found: "
    else:
        wrong_pattern, prefix = None, None

    found = []
    if wrong_pattern is not None:
        found = [
            prefix + hit.group()
            for hit in re.finditer(wrong_pattern, text, re.IGNORECASE)
            if not in_excluded_context(text, hit.start())
        ]

    # Template parameter problems are appended after the date problems.
    found.extend(check_template_params(text, required_format))
    return found
def fix_violations(text, required_format):
    """Rewrite wrongly formatted dates and ``df``/``mf`` parameters in wikitext.

    Only dates that use a real English month name are converted, and dates
    for which ``in_excluded_context`` reports a wikilink, template or
    reference are left untouched, so the fixer agrees with what
    ``find_format_violations`` reports.

    Args:
        text: Wikitext to correct.
        required_format: ``'dmy'`` or ``'mdy'``.  Any other value leaves
            the dates unchanged.

    Returns:
        The corrected wikitext as a string.
    """
    months = (r'(?:January|February|March|April|May|June|July|August|'
              r'September|October|November|December)')

    # 1. Fix date formats
    if required_format == 'dmy':
        # "May 5, 2020" -> "5 May 2020"
        pattern = r'\b(' + months + r')\s+(\d{1,2}),?\s+(\d{4})\b'
        replacement = r'\2 \1 \3'
    elif required_format == 'mdy':
        # "5 May 2020" -> "May 5, 2020"
        pattern = r'\b(\d{1,2})\s+(' + months + r')\s+(\d{4})\b'
        replacement = r'\2 \1, \3'
    else:
        pattern = None

    if pattern is not None:
        source = text

        def convert(match):
            # Leave link targets, template arguments and references alone;
            # rewriting them can break file names, titles and citations.
            if in_excluded_context(source, match.start()):
                return match.group(0)
            return match.expand(replacement)

        text = re.sub(pattern, convert, source, flags=re.IGNORECASE)

    # 2. Fix parameters
    wikicode = mwparserfromhell.parse(text)
    for template in wikicode.filter_templates():
        for param in template.params:
            pname = param.name.strip().lower()
            if required_format == 'dmy' and pname == 'mf':
                param.name = "df"  # Replace mf with df
            elif required_format == 'mdy' and pname == 'df':
                param.name = "mf"  # Replace df with mf
    return str(wikicode)
def check_template_params(text, required_format):
    """List template parameters that contradict the required date format.

    Args:
        text: Wikitext to parse.
        required_format: ``'dmy'`` or ``'mdy'``; other values yield no
            violations.

    Returns:
        One message per offending parameter: an ``mf`` parameter on a
        ``'dmy'`` page or a ``df`` parameter on an ``'mdy'`` page.  Only
        the parameter name is examined, not its value.
    """
    wants_dmy = required_format == 'dmy'
    wants_mdy = required_format == 'mdy'
    problems = []
    for tpl in mwparserfromhell.parse(text).filter_templates():
        # Compare on the normalised (stripped, lower-cased) parameter name.
        for key in (p.name.strip().lower() for p in tpl.params):
            if wants_dmy and key == 'mf':
                problems.append(f"|mf=y in {tpl.name} (should be df=y)")
            elif wants_mdy and key == 'df':
                problems.append(f"|df=y in {tpl.name} (should be mf=y)")
    return problems
def process_page(page):
    """Check one page for date-format violations and save a fix on confirmation.

    The page is skipped when it does not exist, is a redirect, has no
    required format, has no violations, or when the fix would not change
    the text.  Otherwise a unified diff preview is printed and the operator
    is asked to confirm before the page is saved.

    Args:
        page: The page object to process.

    Returns:
        ``None``.  All exceptions are caught and printed; the function
        always sleeps 10 seconds before returning, on every exit path.
    """
    max_diff_lines = 15
    try:
        print(f"\nProcessing: [[{page.title()}]]")

        # Skip if page doesn't meet criteria
        if not page.exists() or page.isRedirectPage():
            return

        # Get required format
        required_format = get_required_format(page)
        if not required_format:
            return

        # Find violations
        original_text = page.text
        violations = find_format_violations(original_text, required_format)
        if not violations:
            return
        print(f"Found {len(violations)} violations")

        fixed_text = fix_violations(original_text, required_format)
        if fixed_text == original_text:
            # Nothing to preview or save; avoid prompting for a null edit.
            print("No changes to apply")
            return

        # Show diff.  The input lines carry no trailing newline, so
        # lineterm='' keeps the header lines newline-free as well.
        diff_lines = list(unified_diff(
            original_text.split('\n'),
            fixed_text.split('\n'),
            fromfile='Original',
            tofile='Corrected',
            n=3,
            lineterm='',
        ))
        print('\n'.join(diff_lines[:max_diff_lines]))
        if len(diff_lines) > max_diff_lines:
            # Tell the operator that the preview is incomplete.
            print(f"... ({len(diff_lines) - max_diff_lines} more diff lines not shown)")

        # Manual confirmation
        if input("Apply changes? (y/n) ").strip().lower() != 'y':
            return

        # Save changes
        try:
            page.text = fixed_text
            # NOTE(review): with asynchronous=True the save is presumably
            # queued, so the success message below may precede the actual
            # edit and later save errors would not reach this except.
            page.save(
                summary=f"Bot: Fix {len(violations)} date issues",
                botflag=False,  # Set to True after approval
                minor=False,
                asynchronous=True  # Better for batch edits
            )
            print(f"Successfully edited [[{page.title()}]]")
        except Exception as e:
            print(f"Error: {str(e)}")
    except Exception as e:
        print(f"Error: {str(e)}")
    finally:
        time.sleep(10)  # Rate limit
def get_category_pages(resume_from=None, limit=50):
    """Yield unprocessed main-namespace pages from the date-format categories.

    Titles already listed in ``bot_progress.txt`` are skipped.  Each title
    is appended to that file once the consumer asks for the next page, i.e.
    after the previously yielded page has been handled.

    Args:
        resume_from: Optional page title.  Every page is skipped until this
            title is reached; iteration continues normally from there.
        limit: Maximum number of pages to yield; a falsy value disables
            the limit.

    Yields:
        Page objects from "Category:Use dmy dates" and
        "Category:Use mdy dates" (subcategories included).

    Raises:
        Exception: Any error raised while iterating is printed and then
            re-raised.
    """
    try:
        # Initialize
        site = pywikibot.Site("en", "wikipedia")
        print(f"Connected to {site} at {datetime.now()}")

        # State tracking.  Titles may contain non-ASCII characters, so the
        # file encoding is fixed instead of depending on the platform.
        state_file = "bot_progress.txt"
        processed = set()
        if os.path.exists(state_file):
            with open(state_file, encoding="utf-8") as f:
                processed = {line.strip() for line in f}
            print(f"Loaded {len(processed)} previously processed pages")

        # Categories to process
        categories = [
            "Category:Use dmy dates",
            "Category:Use mdy dates"
        ]
        count = 0
        for cat_name in categories:
            print(f"\nProcessing {cat_name}...")
            category = pywikibot.Category(site, cat_name)
            if not category.exists():
                print(f"Category {cat_name} doesn't exist!")
                continue
            gen = pywikibot.pagegenerators.CategorizedPageGenerator(
                category,
                recurse=True,
                namespaces=[0],
            )
            for page in gen:
                title = page.title()
                print(f" Evaluating: {title}")

                # Handle the resume marker before the processed check, so
                # an already-processed resume page cannot block everything.
                if resume_from:
                    if title != resume_from:
                        continue
                    resume_from = None

                if title in processed:
                    continue

                yield page
                count += 1

                # Record in memory too: the same page can be reached again
                # through another subcategory during this run.
                processed.add(title)
                with open(state_file, 'a', encoding="utf-8") as f:
                    f.write(f"{title}\n")

                if limit and count >= limit:
                    print(f"Reached limit of {limit} pages")
                    return
    except Exception as e:
        print(f"Error in get_category_pages: {str(e)}")
        raise
# Script entry: run the bot over a small batch of category pages.
print("Starting bot run...")
try:
    # Test with 5 first; will be changed later
    pages = get_category_pages(limit=5)
    for number, page in enumerate(pages, start=1):
        print(f"\nProcessing page {number}: {page.title()}")
        process_page(page)
except KeyboardInterrupt:
    # Ctrl-C is an expected way to stop the run.
    print("\nBot stopped by user")
except Exception as e:
    print(f"Bot crashed: {str(e)}")
finally:
    # Printed on every exit path, including interruption and crashes.
    print("Bot run completed")