User:PharyngealImplosive7/IPAlinker.py
Description
This page contains old code for PharyngealBOT task 2. It is NOT the current code and is retained only for historical reasons; the current code lives in the GitHub repository. – PharyngealImplosive7 (talk) 00:00, 1 June 2025 (UTC)
Actual code
import pywikibot
from mwparserfromhell import parse, nodes
import urllib.parse
import re
import unicodedata
from dataclasses import dataclass
from typing import List, Tuple, Optional, Dict, Any
from functools import lru_cache
import yaml
import sys
from pathlib import Path
# Pre-intern common strings
_STRINGS = {s: sys.intern(s) for s in ['ipa', 'separator', 'IPA link']}
@dataclass
class ProcessingStats:
    changes: int = 0
    processed_count: int = 0
    modified_count: int = 0
    skipped_count: int = 0
class IPAProcessor:
"""Optimized IPA processor with combined functionality."""
def __init__(self, config_path: str = "ipa_config.yaml"):
self.config_path = config_path
self._load_config()
self._compile_patterns()
self._build_char_maps()
self.stats = ProcessingStats()
print(f"IPA Processor initialized with config: {config_path}")
def _load_config(self):
"""Load and cache configuration."""
config_file = Path(self.config_path)
if not config_file.exists():
raise FileNotFoundError(f"Configuration file not found: {self.config_path}")
with open(config_file, 'r', encoding='utf-8') as f:
c = yaml.safe_load(f)
# Store config values directly as attributes for faster access
self.max_ipa_length = c.get('max_ipa_length', 1)
self.brackets = c.get('brackets', {})
self.brackets_to_remove = frozenset(c.get('brackets_to_remove', []))
self.multi_char_exceptions = frozenset(c.get('multi_char_exceptions', []))
self.vowel_symbols = frozenset(c.get('vowel_symbols', []))
self.tone_symbols = frozenset(c.get('tone_symbols', []))
self.click_symbols = frozenset(c.get('click_symbols', []))
self.diacritics = frozenset(c.get('diacritics', []))
self.non_ipa_diacritics = frozenset(c.get('non_ipa_diacritics', []))
self.allophone_separators = frozenset(c.get('allophone_separators', []))
self.separator_symbols = frozenset(c.get('separator_symbols', []))
self.phonetic_terms = frozenset(term.lower() for term in c.get('phonetic_terms', []))
self.tiebar = sys.intern(c.get('tiebar', '͡'))
self.ejective_marker = sys.intern(c.get('ejective_marker', 'ʼ'))
# Process IPA brackets
ipa_brackets_raw = c.get('ipa_brackets', {})
self.ipa_brackets = {}
for open_b, bracket_data in ipa_brackets_raw.items():
self.ipa_brackets[sys.intern(open_b)] = (
sys.intern(bracket_data[0]),
sys.intern(bracket_data[1])
)
self._superscript_pattern = c.get('superscript_regex_pattern',
r"[\u02B0-\u02FF\u1D00-\u1DBF\u2070-\u209F\u1D2C-\u1D6B]")
def _compile_patterns(self):
"""Compile all regex patterns once."""
self.superscript_regex = re.compile(self._superscript_pattern)
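        # Capture separators together with their surrounding whitespace so both can be re-emitted verbatim.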
separator_pattern = '|'.join(re.escape(sep) for sep in self.separator_symbols)
self.separator_split_pattern = re.compile(rf'(\s*)({separator_pattern})(\s*)')
self.table_pattern = re.compile(r'(?:^|\n)\s*:{0,4}\s*\{\|.*?\n\|\}', re.MULTILINE | re.DOTALL)
self.word_pattern = re.compile(r'\b\w+\b')
self.space_pattern = re.compile(r'(\s+)')
def _build_char_maps(self):
"""Build efficient character classification maps."""
self.char_types = {}
for char in self.diacritics:
self.char_types[char] = 'diacritic'
for char in self.tone_symbols:
self.char_types[char] = 'tone'
for char in self.vowel_symbols:
self.char_types[char] = 'vowel'
for char in self.click_symbols:
self.char_types[char] = 'click'
for char in self.non_ipa_diacritics:
self.char_types[char] = 'non_ipa'
@lru_cache(maxsize=1000)
def analyze_segment(self, segment: str) -> Dict[str, Any]:
"""Single-pass segment analysis with caching."""
seg_clean = ''.join(c for c in segment if c not in self.brackets_to_remove and not c.isspace())
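        # NFD-normalize so precomposed characters decompose and combining diacritics become detectable.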
normalized = unicodedata.normalize('NFD', seg_clean)
has_tone = any(c in self.tone_symbols for c in segment) or segment in self.tone_symbols
has_ejective = self.ejective_marker in segment
has_diacritic = any(d in normalized for d in self.diacritics) or bool(self.superscript_regex.search(seg_clean))
has_non_ipa = any(c in self.non_ipa_diacritics for c in normalized)
has_tiebar = self.tiebar in normalized
has_click = any(click in normalized for click in self.click_symbols)
# Count vowels for diphthong detection
clean_for_vowels = ''.join(c for c in seg_clean
if c not in self.diacritics and not self.superscript_regex.match(c))
vowel_count = sum(1 for c in clean_for_vowels.lower() if c in self.vowel_symbols)
is_diphthong = vowel_count >= 2
# Check if ejective consonant
        is_ejective_consonant = False
        if has_ejective:
            base = segment.replace(self.ejective_marker, '').strip()
            # A single non-vowel base character plus the marker counts as an ejective consonant.
            is_ejective_consonant = (len(base) == 1 and
                                     base.lower() not in self.vowel_symbols)
# Determine if special segment
is_special = (seg_clean.lower() in self.multi_char_exceptions or
has_diacritic or has_tiebar or has_click or
is_ejective_consonant or len(seg_clean.strip()) == 1)
        # Decide whether this segment should be wrapped in an IPA link template
        should_link = bool(not has_tone and not is_diphthong and not has_non_ipa and
                           seg_clean.strip() and (is_ejective_consonant or is_special))
return {
'has_tone': has_tone,
'is_diphthong': is_diphthong,
'has_non_ipa': has_non_ipa,
'is_ejective_consonant': is_ejective_consonant,
'should_link': should_link,
'clean': seg_clean
}
def detect_ipa_brackets(self, segment: str) -> Tuple[Optional[str], Optional[str], str, Optional[str]]:
"""Fast bracket detection."""
segment = segment.strip()
# Check IPA-specific brackets first
for open_b, (close_b, template_name) in self.ipa_brackets.items():
if segment.startswith(open_b) and segment.endswith(close_b):
content = segment[len(open_b):-len(close_b)].strip()
return open_b, close_b, content, template_name
# Fallback to regular brackets
for open_b, close_b in self.brackets.items():
if segment.startswith(open_b) and segment.endswith(close_b):
content = segment[len(open_b):-len(close_b)].strip()
return open_b, close_b, content, None
return None, None, segment, None
def tokenize_content(self, content: str) -> List:
"""Fast tokenization with single regex split."""
result = []
parts = self.separator_split_pattern.split(content)
for part in parts:
if not part:
continue
if part.strip() in self.separator_symbols:
result.append((_STRINGS['separator'], part.strip(), ''))
            elif part.isspace():
                # Keep whitespace runs so spacing around separators survives reassembly.
                result.append(part)
else:
if part.strip():
space_parts = self.space_pattern.split(part)
for space_part in space_parts:
if space_part.strip():
result.append(space_part.strip())
elif space_part.isspace():
result.append(space_part)
return result
def contains_phonetic_terms(self, text: str, min_terms: int = 3) -> Tuple[bool, List[str]]:
"""Fast phonetic term detection."""
words = set(self.word_pattern.findall(text.lower()))
matched = [w for w in words if w in self.phonetic_terms]
        return len(matched) >= min_terms, matched[:min_terms]
def find_tables(self, text: str) -> List[Tuple[int, int, str]]:
"""Find table boundaries."""
return [(m.start(), m.end(), m.group()) for m in self.table_pattern.finditer(text)]
def is_in_table(self, pos: int, tables: List[Tuple[int, int, str]]) -> Tuple[bool, Optional[str]]:
"""Check if position is in any table."""
for start, end, content in tables:
if start <= pos <= end:
return True, content
return False, None
def process_ipa_template(self, node: nodes.Template, parent_list: List, index: int) -> None:
"""Process IPA template with optimized logic."""
        if node.name.strip().lower() != _STRINGS['ipa'] or not node.params:
            return
        raw_content = str(node.params[0].value).strip()
open_b, close_b, inner_content, template_name = self.detect_ipa_brackets(raw_content)
# Handle IPA brackets with separators
if (open_b and close_b and template_name and
any(sep in inner_content for sep in self.separator_symbols)):
self._process_bracketed_allophones(node, parent_list, index, raw_content,
open_b, close_b, inner_content, template_name)
return
# Handle simple IPA brackets
if open_b and close_b and template_name:
self._process_simple_brackets(node, parent_list, index, raw_content,
open_b, close_b, inner_content, template_name)
return
# Fallback processing
segments = self.tokenize_content(raw_content)
        if not any(isinstance(s, str) and not s.isspace() and
                   self.analyze_segment(s)['should_link']
                   for s in segments):
return
new_nodes = self._create_nodes(segments)
if new_nodes:
parent_list[index:index+1] = new_nodes
self.stats.changes += 1
print(f"Converted IPA template: {raw_content}")
def _process_simple_brackets(self, node, parent_list, index, raw_content,
open_b, close_b, inner_content, template_name):
"""Handle simple IPA brackets."""
if not inner_content.strip():
return
analysis = self.analyze_segment(inner_content)
if analysis['should_link']:
ipa_link = nodes.Template(name=template_name)
ipa_link.add("1", inner_content.strip())
parent_list[index:index+1] = [ipa_link]
self.stats.changes += 1
print(f"Converted: {raw_content} -> {{{{{template_name}|{inner_content.strip()}}}}}")
def _process_bracketed_allophones(self, node, parent_list, index, raw_content,
open_b, close_b, inner_content, template_name):
"""Handle bracketed allophone sets."""
segments = self.tokenize_content(inner_content)
new_nodes = []
for segment in segments:
if isinstance(segment, tuple) and len(segment) == 3:
new_nodes.append(nodes.Text(segment[1]))
if segment[2]:
new_nodes.append(nodes.Text(segment[2]))
continue
if isinstance(segment, str):
if segment.isspace():
new_nodes.append(nodes.Text(segment))
continue
if not segment.strip():
continue
analysis = self.analyze_segment(segment)
if analysis['should_link']:
ipa_link = nodes.Template(name=template_name)
ipa_link.add("1", segment.strip())
new_nodes.append(ipa_link)
else:
new_nodes.append(nodes.Text(segment))
if new_nodes:
parent_list[index:index+1] = new_nodes
self.stats.changes += 1
print(f"Converted allophone template: {raw_content}")
def _create_nodes(self, segments: List) -> List[nodes.Node]:
"""Create processed nodes efficiently."""
new_nodes = []
for segment in segments:
if isinstance(segment, tuple) and len(segment) == 3:
new_nodes.append(nodes.Text(segment[1]))
if segment[2]:
new_nodes.append(nodes.Text(segment[2]))
continue
if isinstance(segment, str) and segment.isspace():
new_nodes.append(nodes.Text(segment))
continue
            open_b, close_b, core, _ = self.detect_ipa_brackets(segment)
if not core.strip():
continue
analysis = self.analyze_segment(core)
if not analysis['should_link']:
return [] # Preserve original if any segment shouldn't be linked
if open_b:
new_nodes.append(nodes.Text(open_b))
ipa_link = nodes.Template(name=_STRINGS['IPA link'])
ipa_link.add("1", core)
new_nodes.append(ipa_link)
if close_b:
new_nodes.append(nodes.Text(close_b))
return new_nodes
def process_page(self, page: pywikibot.Page) -> bool:
"""Process single page efficiently."""
print(f"\nProcessing: {page.title()}")
try:
text = page.get()
except pywikibot.exceptions.NoPageError:
print("Page doesn't exist!")
return False
wikicode = parse(text)
self.stats.changes = 0
tables = self.find_tables(text)
self._process_nodes_with_context(wikicode.nodes, 0, tables)
if self.stats.changes:
new_text = str(wikicode)
print(f"\nFound {self.stats.changes} IPA conversion(s)")
pywikibot.showDiff(text, new_text)
if input("Save changes? (y/n): ").lower() == 'y':
page.text = new_text
page.save(summary=f"IPA conversion in phonetic tables ({self.stats.changes} templates)", bot=True)
return True
else:
print("No IPA templates needed conversion")
return False
def _process_nodes_with_context(self, node_list: List[nodes.Node], text_offset: int, tables: List):
"""Process nodes with table context."""
i = 0
current_offset = text_offset
while i < len(node_list):
node = node_list[i]
node_str = str(node)
in_table, table_content = self.is_in_table(current_offset, tables)
if isinstance(node, nodes.Template) and node.name.strip().lower() == _STRINGS['ipa']:
if in_table:
is_relevant, _ = self.contains_phonetic_terms(table_content, 3)
if is_relevant:
self.process_ipa_template(node, node_list, i)
elif isinstance(node, nodes.Tag) and hasattr(node, 'contents') and hasattr(node.contents, 'nodes'):
tag_start_len = len(f"<{node.tag}>")
self._process_nodes_with_context(node.contents.nodes, current_offset + tag_start_len, tables)
current_offset += len(node_str)
i += 1
def process_category(self, category_name: str, depth: int = 0,
max_pages: Optional[int] = None, skip_pages: int = 0) -> Tuple[int, int]:
"""Process category efficiently."""
site = pywikibot.Site('en', 'wikipedia')
cat = pywikibot.Category(site, f"Category:{category_name}")
print(f"\n=== Processing Category: {cat.title()} ===")
all_pages = list(cat.articles(recurse=depth))
pages = [page for page in all_pages if page.namespace() == 0]
print(f"Found {len(all_pages)} total pages, {len(pages)} article space pages")
if max_pages:
print(f"Will process up to {max_pages} pages")
if skip_pages:
print(f"Skipping first {skip_pages} pages")
self.stats.processed_count = 0
self.stats.modified_count = 0
self.stats.skipped_count = 0
for i, page in enumerate(pages):
if max_pages and self.stats.processed_count >= max_pages:
break
if self.stats.skipped_count < skip_pages:
self.stats.skipped_count += 1
continue
print(f"\n[{i+1}/{len(pages)}] Processing article: {page.title()}")
try:
if self.process_page(page):
self.stats.modified_count += 1
self.stats.processed_count += 1
except Exception as e:
print(f"Error processing page {page.title()}: {e}")
print(f"\n=== Category Processing Complete ===")
print(f"Processed {self.stats.processed_count} pages")
print(f"Made changes to {self.stats.modified_count} pages")
return self.stats.processed_count, self.stats.modified_count
def reload_config(self):
"""Reload configuration."""
self._load_config()
self._compile_patterns()
self._build_char_maps()
# Clear cache
self.analyze_segment.cache_clear()
print(f"Configuration reloaded from {self.config_path}")
def main():
"""Main entry point."""
# URL decoding workaround
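    # Some pywikibot versions do not expose tools.chars.url2string, so install a
    # minimal shim (assumes callers pass a list of encodings).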
if not hasattr(pywikibot, 'tools'):
pywikibot.tools = type('', (), {})()
pywikibot.tools.chars = type('', (), {})()
pywikibot.tools.chars.url2string = lambda text, encodings=None: urllib.parse.unquote(
text, encoding=(encodings or ['utf-8'])[0]
)
try:
        site = pywikibot.Site('en', 'wikipedia')
        site.login()  # ensure a session exists before checking login state
        if site.logged_in():
print(f"Successfully logged in as: {site.username()}")
else:
print("Not logged in. Please check your authentication.")
return
config_path = input("Enter config file path (or press Enter for default 'ipa_config.yaml'): ").strip()
processor = IPAProcessor(config_path or "ipa_config.yaml")
while True:
print("\nOptions:")
print("1. Process a specific page")
print("2. Process a category")
print("3. Reload configuration")
print("4. Exit")
choice = input("Enter your choice (1-4): ").strip()
if choice == '1':
page_title = input("Enter page title: ").strip()
page = pywikibot.Page(site, page_title)
processor.process_page(page)
elif choice == '2':
category_name = input("Enter category name (without 'Category:' prefix): ").strip()
depth = int(input("Enter recursion depth (0 for just this category): ").strip() or "0")
max_pages_str = input("Enter maximum pages (or enter for no limit): ").strip()
max_pages = int(max_pages_str) if max_pages_str else None
skip_pages = int(input("Enter pages to skip (or enter for none): ").strip() or "0")
processor.process_category(category_name, depth, max_pages, skip_pages)
elif choice == '3':
processor.reload_config()
elif choice == '4':
print("Exiting program.")
break
else:
print("Invalid choice. Please enter 1, 2, 3, or 4.")
except KeyboardInterrupt:
print("\nOperation interrupted by user.")
except Exception as e:
print(f"An error occurred: {e}")
import traceback
traceback.print_exc()
if __name__ == "__main__":
main()
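Example configuration and usage
For reference, here is a minimal sketch of the kind of ipa_config.yaml the loader above expects, plus a one-off driver. Every symbol inventory below is an illustrative placeholder rather than the bot's real configuration, and the snippet assumes the IPAProcessor class above is importable or defined in the same session:
# Illustrative only: write a placeholder config, then run two segment analyses.
import yaml

SAMPLE_CONFIG = {
    'max_ipa_length': 1,
    'brackets': {'(': ')'},
    # open bracket -> [close bracket, replacement template name]
    'ipa_brackets': {'[': [']', 'IPA link'], '/': ['/', 'IPA link']},
    'brackets_to_remove': ['[', ']', '/'],
    'multi_char_exceptions': ['ts'],
    'vowel_symbols': ['a', 'e', 'i', 'o', 'u'],
    'tone_symbols': ['˥', '˩'],
    'click_symbols': ['ǃ', 'ǂ'],
    'diacritics': ['̥', '̃'],
    'non_ipa_diacritics': [],
    'allophone_separators': ['~'],
    'separator_symbols': ['~', ','],
    'phonetic_terms': ['consonant', 'vowel', 'phoneme', 'allophone'],
    'tiebar': '͡',
    'ejective_marker': 'ʼ',
}

with open('ipa_config.yaml', 'w', encoding='utf-8') as f:
    yaml.safe_dump(SAMPLE_CONFIG, f, allow_unicode=True)

processor = IPAProcessor('ipa_config.yaml')
# A lone ejective consonant is linkable...
print(processor.analyze_segment('pʼ')['should_link'])  # True
# ...while a two-vowel sequence counts as a diphthong and is left alone.
print(processor.analyze_segment('ai')['should_link'])  # False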