User:PharyngealImplosive7/IPAlinker.py
Description
This contains old code for PharyngealBOT task 2. It is NOT the current code and is retained for historical reasons; the current code is in the GitHub repository. – PharyngealImplosive7 (talk) 00:00, 1 June 2025 (UTC)
Actual code
import pywikibot
from mwparserfromhell import parse, nodes
import urllib.parse
import re
import unicodedata
from dataclasses import dataclass
from typing import Set, List, Tuple, Optional, Dict, Any
from functools import lru_cache
import yaml
import sys
from pathlib import Path
# Pre-intern common strings
_STRINGS = {s: sys.intern(s) for s in ['ipa', 'separator', 'IPA link']}

@dataclass
class ProcessingStats:
    __slots__ = ['changes', 'processed_count', 'modified_count', 'skipped_count']

    def __init__(self):
        self.changes = 0
        self.processed_count = 0
        self.modified_count = 0
        self.skipped_count = 0

class IPAProcessor:
    """Optimized IPA processor with combined functionality."""

    def __init__(self, config_path: str = "ipa_config.yaml"):
        self.config_path = config_path
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        self.stats = ProcessingStats()
        print(f"IPA Processor initialized with config: {config_path}")

    def _load_config(self):
        """Load and cache configuration."""
        config_file = Path(self.config_path)
        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {self.config_path}")
        with open(config_file, 'r', encoding='utf-8') as f:
            c = yaml.safe_load(f)
        # Store config values directly as attributes for faster access
        self.max_ipa_length = c.get('max_ipa_length', 1)
        self.brackets = c.get('brackets', {})
        self.brackets_to_remove = frozenset(c.get('brackets_to_remove', []))
        self.multi_char_exceptions = frozenset(c.get('multi_char_exceptions', []))
        self.vowel_symbols = frozenset(c.get('vowel_symbols', []))
        self.tone_symbols = frozenset(c.get('tone_symbols', []))
        self.click_symbols = frozenset(c.get('click_symbols', []))
        self.diacritics = frozenset(c.get('diacritics', []))
        self.non_ipa_diacritics = frozenset(c.get('non_ipa_diacritics', []))
        self.allophone_separators = frozenset(c.get('allophone_separators', []))
        self.separator_symbols = frozenset(c.get('separator_symbols', []))
        self.phonetic_terms = frozenset(term.lower() for term in c.get('phonetic_terms', []))
        self.tiebar = sys.intern(c.get('tiebar', '͡'))
        self.ejective_marker = sys.intern(c.get('ejective_marker', 'ʼ'))
        # Process IPA brackets: open bracket -> (close bracket, template name)
        ipa_brackets_raw = c.get('ipa_brackets', {})
        self.ipa_brackets = {}
        for open_b, bracket_data in ipa_brackets_raw.items():
            self.ipa_brackets[sys.intern(open_b)] = (
                sys.intern(bracket_data[0]),
                sys.intern(bracket_data[1])
            )
        self._superscript_pattern = c.get('superscript_regex_pattern',
                                          r"[\u02B0-\u02FF\u1D00-\u1DBF\u2070-\u209F\u1D2C-\u1D6B]")

    def _compile_patterns(self):
        """Compile all regex patterns once."""
        self.superscript_regex = re.compile(self._superscript_pattern)
        separator_pattern = '|'.join(re.escape(sep) for sep in self.separator_symbols)
        self.separator_split_pattern = re.compile(rf'(\s*)({separator_pattern})(\s*)')
        self.table_pattern = re.compile(r'(?:^|\n)\s*:{0,4}\s*\{\|.*?\n\|\}', re.MULTILINE | re.DOTALL)
        self.word_pattern = re.compile(r'\b\w+\b')
        self.space_pattern = re.compile(r'(\s+)')

    def _build_char_maps(self):
        """Build efficient character classification maps."""
        # Note: char_types is built here but not consulted elsewhere in this
        # version of the script.
        self.char_types = {}
        for char in self.diacritics:
            self.char_types[char] = 'diacritic'
        for char in self.tone_symbols:
            self.char_types[char] = 'tone'
        for char in self.vowel_symbols:
            self.char_types[char] = 'vowel'
        for char in self.click_symbols:
            self.char_types[char] = 'click'
        for char in self.non_ipa_diacritics:
            self.char_types[char] = 'non_ipa'

    @lru_cache(maxsize=1000)
    def analyze_segment(self, segment: str) -> Dict[str, Any]:
        """Single-pass segment analysis with caching."""
        seg_clean = ''.join(c for c in segment
                            if c not in self.brackets_to_remove and not c.isspace())
        normalized = unicodedata.normalize('NFD', seg_clean)
        has_tone = any(c in self.tone_symbols for c in segment) or segment in self.tone_symbols
        has_ejective = self.ejective_marker in segment
        has_diacritic = (any(d in normalized for d in self.diacritics) or
                         bool(self.superscript_regex.search(seg_clean)))
        has_non_ipa = any(c in self.non_ipa_diacritics for c in normalized)
        has_tiebar = self.tiebar in normalized
        has_click = any(click in normalized for click in self.click_symbols)
        # Count vowels for diphthong detection
        clean_for_vowels = ''.join(c for c in seg_clean
                                   if c not in self.diacritics and not self.superscript_regex.match(c))
        vowel_count = sum(1 for c in clean_for_vowels.lower() if c in self.vowel_symbols)
        is_diphthong = vowel_count >= 2
        # Check for an ejective consonant: a single non-vowel base plus the marker
        is_ejective_consonant = False
        if has_ejective:
            base = segment.replace(self.ejective_marker, '').strip()
            is_ejective_consonant = len(base) == 1 and base.lower() not in self.vowel_symbols
        # Determine if special segment
        is_special = (seg_clean.lower() in self.multi_char_exceptions or
                      has_diacritic or has_tiebar or has_click or
                      is_ejective_consonant or len(seg_clean.strip()) == 1)
        # Link only plain segments: no tones, diphthongs or non-IPA diacritics
        should_link = bool(not has_tone and not is_diphthong and not has_non_ipa and
                           seg_clean.strip() and (is_ejective_consonant or is_special))
        return {
            'has_tone': has_tone,
            'is_diphthong': is_diphthong,
            'has_non_ipa': has_non_ipa,
            'is_ejective_consonant': is_ejective_consonant,
            'should_link': should_link,
            'clean': seg_clean
        }
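
    # Illustrative behaviour (hypothetical calls under the sample config above,
    # not output captured from the original bot):
    #   analyze_segment("t͡s")['should_link'] -> True   (tiebar makes it special)
    #   analyze_segment("kʼ")['should_link']  -> True   (single-base ejective)
    #   analyze_segment("ai")['should_link']  -> False  (two vowels => diphthong)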

    def detect_ipa_brackets(self, segment: str) -> Tuple[Optional[str], Optional[str], str, Optional[str]]:
        """Fast bracket detection."""
        segment = segment.strip()
        # Check IPA-specific brackets first
        for open_b, (close_b, template_name) in self.ipa_brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, template_name
        # Fall back to regular brackets
        for open_b, close_b in self.brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, None
        return None, None, segment, None
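
    # Example returns (assuming the sample ipa_brackets/brackets above):
    #   detect_ipa_brackets("[t]") -> ("[", "]", "t", "IPAblink")
    #   detect_ipa_brackets("(t)") -> ("(", ")", "t", None)
    #   detect_ipa_brackets("t")   -> (None, None, "t", None)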

    def tokenize_content(self, content: str) -> List:
        """Fast tokenization with a single regex split."""
        result = []
        parts = self.separator_split_pattern.split(content)
        for part in parts:
            if not part:
                continue
            if part.strip() in self.separator_symbols:
                result.append((_STRINGS['separator'], part.strip(), ''))
            elif part.isspace():
                continue
            elif part.strip():
                space_parts = self.space_pattern.split(part)
                for space_part in space_parts:
                    if space_part.strip():
                        result.append(space_part.strip())
                    elif space_part.isspace():
                        result.append(space_part)
        return result
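
    # Example (assuming "~" is in separator_symbols):
    #   tokenize_content("t ~ d") -> ['t', ('separator', '~', ''), 'd']
    # Whitespace around a separator is dropped; whitespace inside a plain-text
    # part is kept as its own token.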

    def contains_phonetic_terms(self, text: str, min_terms: int = 3) -> Tuple[bool, List[str]]:
        """Fast phonetic term detection."""
        words = set(self.word_pattern.findall(text.lower()))
        matched = [w for w in words if w in self.phonetic_terms]
        return len(matched) >= min_terms, matched[:min_terms]

    def find_tables(self, text: str) -> List[Tuple[int, int, str]]:
        """Find table boundaries."""
        return [(m.start(), m.end(), m.group()) for m in self.table_pattern.finditer(text)]

    def is_in_table(self, pos: int, tables: List[Tuple[int, int, str]]) -> Tuple[bool, Optional[str]]:
        """Check if a position falls inside any table."""
        for start, end, content in tables:
            if start <= pos <= end:
                return True, content
        return False, None
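
    # Example: in "intro\n{| class=wikitable\n| {{IPA|[t]}}\n|}\nend",
    # find_tables() yields one (start, end, content) triple spanning the
    # "{| ... |}" block, and is_in_table(pos, tables) is True for any offset
    # inside that span.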

    def process_ipa_template(self, node: nodes.Template, parent_list: List, index: int) -> None:
        """Process IPA template with optimized logic."""
        if node.name.strip().lower() != _STRINGS['ipa'] or not node.params:
            return
        raw_content = str(node.params[0].value).strip()
        open_b, close_b, inner_content, template_name = self.detect_ipa_brackets(raw_content)
        # Handle IPA brackets with separators (allophone sets)
        if (open_b and close_b and template_name and
                any(sep in inner_content for sep in self.separator_symbols)):
            self._process_bracketed_allophones(node, parent_list, index, raw_content,
                                               open_b, close_b, inner_content, template_name)
            return
        # Handle simple IPA brackets
        if open_b and close_b and template_name:
            self._process_simple_brackets(node, parent_list, index, raw_content,
                                          open_b, close_b, inner_content, template_name)
            return
        # Fallback processing of unbracketed content
        segments = self.tokenize_content(raw_content)
        if not any(isinstance(s, str) and not s.isspace() and
                   self.analyze_segment(s)['should_link']
                   for s in segments):
            return
        new_nodes = self._create_nodes(segments)
        if new_nodes:
            parent_list[index:index + 1] = new_nodes
            self.stats.changes += 1
            print(f"Converted IPA template: {raw_content}")

    def _process_simple_brackets(self, node, parent_list, index, raw_content,
                                 open_b, close_b, inner_content, template_name):
        """Handle simple IPA brackets."""
        if not inner_content.strip():
            return
        analysis = self.analyze_segment(inner_content)
        if analysis['should_link']:
            ipa_link = nodes.Template(name=template_name)
            ipa_link.add("1", inner_content.strip())
            parent_list[index:index + 1] = [ipa_link]
            self.stats.changes += 1
            print(f"Converted: {raw_content} -> {{{{{template_name}|{inner_content.strip()}}}}}")

    def _process_bracketed_allophones(self, node, parent_list, index, raw_content,
                                      open_b, close_b, inner_content, template_name):
        """Handle bracketed allophone sets."""
        segments = self.tokenize_content(inner_content)
        new_nodes = []
        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue
            if isinstance(segment, str):
                if segment.isspace():
                    new_nodes.append(nodes.Text(segment))
                    continue
                if not segment.strip():
                    continue
                analysis = self.analyze_segment(segment)
                if analysis['should_link']:
                    ipa_link = nodes.Template(name=template_name)
                    ipa_link.add("1", segment.strip())
                    new_nodes.append(ipa_link)
                else:
                    new_nodes.append(nodes.Text(segment))
        if new_nodes:
            parent_list[index:index + 1] = new_nodes
            self.stats.changes += 1
            print(f"Converted allophone template: {raw_content}")

    def _create_nodes(self, segments: List) -> List[nodes.Node]:
        """Create processed nodes efficiently."""
        new_nodes = []
        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue
            if isinstance(segment, str) and segment.isspace():
                new_nodes.append(nodes.Text(segment))
                continue
            # detect_ipa_brackets returns a 4-tuple; the template name is unused here
            open_b, close_b, core, _ = self.detect_ipa_brackets(segment)
            if not core.strip():
                continue
            analysis = self.analyze_segment(core)
            if not analysis['should_link']:
                return []  # Preserve the original if any segment shouldn't be linked
            if open_b:
                new_nodes.append(nodes.Text(open_b))
            ipa_link = nodes.Template(name=_STRINGS['IPA link'])
            ipa_link.add("1", core)
            new_nodes.append(ipa_link)
            if close_b:
                new_nodes.append(nodes.Text(close_b))
        return new_nodes

    def process_page(self, page: pywikibot.Page) -> bool:
        """Process a single page efficiently."""
        print(f"\nProcessing: {page.title()}")
        try:
            text = page.get()
        except pywikibot.exceptions.NoPageError:
            print("Page doesn't exist!")
            return False
        wikicode = parse(text)
        self.stats.changes = 0
        tables = self.find_tables(text)
        self._process_nodes_with_context(wikicode.nodes, 0, tables)
        if self.stats.changes:
            new_text = str(wikicode)
            print(f"\nFound {self.stats.changes} IPA conversion(s)")
            pywikibot.showDiff(text, new_text)
            if input("Save changes? (y/n): ").lower() == 'y':
                page.text = new_text
                page.save(summary=f"IPA conversion in phonetic tables ({self.stats.changes} templates)", bot=True)
                return True
        else:
            print("No IPA templates needed conversion")
        return False

    def _process_nodes_with_context(self, node_list: List[nodes.Node], text_offset: int, tables: List):
        """Process nodes with table context."""
        i = 0
        current_offset = text_offset
        while i < len(node_list):
            node = node_list[i]
            node_str = str(node)
            in_table, table_content = self.is_in_table(current_offset, tables)
            if isinstance(node, nodes.Template) and node.name.strip().lower() == _STRINGS['ipa']:
                if in_table:
                    is_relevant, _ = self.contains_phonetic_terms(table_content, 3)
                    if is_relevant:
                        self.process_ipa_template(node, node_list, i)
            elif isinstance(node, nodes.Tag) and hasattr(node, 'contents') and hasattr(node.contents, 'nodes'):
                tag_start_len = len(f"<{node.tag}>")
                self._process_nodes_with_context(node.contents.nodes, current_offset + tag_start_len, tables)
            current_offset += len(node_str)
            i += 1

    def process_category(self, category_name: str, depth: int = 0,
                         max_pages: Optional[int] = None, skip_pages: int = 0) -> Tuple[int, int]:
        """Process a category efficiently."""
        site = pywikibot.Site('en', 'wikipedia')
        cat = pywikibot.Category(site, f"Category:{category_name}")
        print(f"\n=== Processing Category: {cat.title()} ===")
        all_pages = list(cat.articles(recurse=depth))
        pages = [page for page in all_pages if page.namespace() == 0]
        print(f"Found {len(all_pages)} total pages, {len(pages)} article space pages")
        if max_pages:
            print(f"Will process up to {max_pages} pages")
        if skip_pages:
            print(f"Skipping first {skip_pages} pages")
        self.stats.processed_count = 0
        self.stats.modified_count = 0
        self.stats.skipped_count = 0
        for i, page in enumerate(pages):
            if max_pages and self.stats.processed_count >= max_pages:
                break
            if self.stats.skipped_count < skip_pages:
                self.stats.skipped_count += 1
                continue
            print(f"\n[{i + 1}/{len(pages)}] Processing article: {page.title()}")
            try:
                if self.process_page(page):
                    self.stats.modified_count += 1
                self.stats.processed_count += 1
            except Exception as e:
                print(f"Error processing page {page.title()}: {e}")
        print("\n=== Category Processing Complete ===")
        print(f"Processed {self.stats.processed_count} pages")
        print(f"Made changes to {self.stats.modified_count} pages")
        return self.stats.processed_count, self.stats.modified_count

    def reload_config(self):
        """Reload configuration."""
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        # Clear the per-segment cache so the new config takes effect
        self.analyze_segment.cache_clear()
        print(f"Configuration reloaded from {self.config_path}")

def main():
    """Main entry point."""
    # URL decoding workaround
    if not hasattr(pywikibot, 'tools'):
        pywikibot.tools = type('', (), {})()
        pywikibot.tools.chars = type('', (), {})()
        pywikibot.tools.chars.url2string = lambda text, encodings=None: urllib.parse.unquote(
            text, encoding=(encodings or ['utf-8'])[0]
        )
    try:
        site = pywikibot.Site('en', 'wikipedia')
        if site.logged_in():
            print(f"Successfully logged in as: {site.username()}")
        else:
            print("Not logged in. Please check your authentication.")
            return
        config_path = input("Enter config file path (or press Enter for default 'ipa_config.yaml'): ").strip()
        processor = IPAProcessor(config_path or "ipa_config.yaml")
        while True:
            print("\nOptions:")
            print("1. Process a specific page")
            print("2. Process a category")
            print("3. Reload configuration")
            print("4. Exit")
            choice = input("Enter your choice (1-4): ").strip()
            if choice == '1':
                page_title = input("Enter page title: ").strip()
                page = pywikibot.Page(site, page_title)
                processor.process_page(page)
            elif choice == '2':
                category_name = input("Enter category name (without 'Category:' prefix): ").strip()
                depth = int(input("Enter recursion depth (0 for just this category): ").strip() or "0")
                max_pages_str = input("Enter maximum pages (or enter for no limit): ").strip()
                max_pages = int(max_pages_str) if max_pages_str else None
                skip_pages = int(input("Enter pages to skip (or enter for none): ").strip() or "0")
                processor.process_category(category_name, depth, max_pages, skip_pages)
            elif choice == '3':
                processor.reload_config()
            elif choice == '4':
                print("Exiting program.")
                break
            else:
                print("Invalid choice. Please enter 1, 2, 3, or 4.")
    except KeyboardInterrupt:
        print("\nOperation interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    main()
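
# A minimal non-interactive usage sketch (hypothetical; the script above is
# normally driven by the interactive main() menu, and process_page() still
# prompts before saving; the page title is only an example):
#
#   site = pywikibot.Site('en', 'wikipedia')
#   processor = IPAProcessor("ipa_config.yaml")
#   processor.process_page(pywikibot.Page(site, "Turkish phonology"))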