Jump to content

User:PharyngealImplosive7/IPAlinker.py

From Wikipedia, the free encyclopedia

Description

[edit]

This contains old code for PharyngealBOT task 2. It is NOT the current code and is retained for historical reasons only. To see the current code, go to the GitHub repository. – PharyngealImplosive7 (talk) 00:00, 1 June 2025 (UTC)

Actual code

[edit]
import pywikibot
from mwparserfromhell import parse, nodes
import urllib.parse
import re
import unicodedata
from dataclasses import dataclass
from typing import Set, List, Tuple, Optional, Dict, Any
from functools import lru_cache
import yaml
import sys
from pathlib import Path

# Pre-intern the short strings that get compared on every template visit, so
# equality checks and dict lookups can hit the CPython identity fast-path.
_COMMON_NAMES = ('ipa', 'separator', 'IPA link')
_STRINGS = {name: sys.intern(name) for name in _COMMON_NAMES}

class ProcessingStats:
    """Mutable counters tracking one bot run.

    The original decorated this with ``@dataclass``, but with no annotated
    fields the decorator generated nothing useful (the hand-written
    ``__init__`` was kept), so the misleading decorator was removed.
    ``__slots__`` is retained to keep instances small.
    """
    __slots__ = ['changes', 'processed_count', 'modified_count', 'skipped_count']

    def __init__(self) -> None:
        self.changes = 0          # template conversions made on the current page
        self.processed_count = 0  # pages actually processed in a category run
        self.modified_count = 0   # pages saved with changes
        self.skipped_count = 0    # pages skipped via the skip_pages offset

    def __repr__(self) -> str:
        body = ', '.join(f"{name}={getattr(self, name)!r}" for name in self.__slots__)
        return f"{type(self).__name__}({body})"

class IPAProcessor:
    """Convert bare ``{{IPA}}`` templates inside phonetic tables to linked forms.

    All symbol classes (vowels, tones, clicks, diacritics, brackets, ...) are
    read from a YAML config file, so the linguistic behaviour is data-driven
    and can be reloaded at runtime via :meth:`reload_config`.
    """

    def __init__(self, config_path: str = "ipa_config.yaml"):
        """Load config, compile regexes, and build lookup maps.

        Raises:
            FileNotFoundError: if *config_path* does not exist.
        """
        self.config_path = config_path
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        # Per-instance memo cache. A class-level @lru_cache on a bound method
        # would key on `self` and pin every instance in the cache for its
        # lifetime (ruff B019), so wrap the implementation per instance here.
        # reload_config() still works: the wrapper exposes cache_clear().
        self.analyze_segment = lru_cache(maxsize=1000)(self._analyze_segment)
        self.stats = ProcessingStats()
        print(f"IPA Processor initialized with config: {config_path}")

    def _load_config(self):
        """Load the YAML configuration into plain attributes.

        Values are stored as frozensets for O(1) membership tests; strings
        compared frequently are interned so `==` hits the identity fast-path.

        Raises:
            FileNotFoundError: if the config file is missing.
        """
        config_file = Path(self.config_path)
        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {self.config_path}")

        with open(config_file, 'r', encoding='utf-8') as f:
            c = yaml.safe_load(f)

        # Store config values directly as attributes for faster access.
        # NOTE(review): max_ipa_length is loaded but never consulted anywhere
        # in this file -- possibly vestigial.
        self.max_ipa_length = c.get('max_ipa_length', 1)
        self.brackets = c.get('brackets', {})  # plain brackets: open -> close
        self.brackets_to_remove = frozenset(c.get('brackets_to_remove', []))
        self.multi_char_exceptions = frozenset(c.get('multi_char_exceptions', []))
        self.vowel_symbols = frozenset(c.get('vowel_symbols', []))
        self.tone_symbols = frozenset(c.get('tone_symbols', []))
        self.click_symbols = frozenset(c.get('click_symbols', []))
        self.diacritics = frozenset(c.get('diacritics', []))
        self.non_ipa_diacritics = frozenset(c.get('non_ipa_diacritics', []))
        self.allophone_separators = frozenset(c.get('allophone_separators', []))
        self.separator_symbols = frozenset(c.get('separator_symbols', []))
        self.phonetic_terms = frozenset(term.lower() for term in c.get('phonetic_terms', []))
        self.tiebar = sys.intern(c.get('tiebar', '͡'))
        self.ejective_marker = sys.intern(c.get('ejective_marker', 'ʼ'))

        # IPA-specific brackets: open -> (close, replacement template name).
        ipa_brackets_raw = c.get('ipa_brackets', {})
        self.ipa_brackets = {}
        for open_b, bracket_data in ipa_brackets_raw.items():
            self.ipa_brackets[sys.intern(open_b)] = (
                sys.intern(bracket_data[0]),
                sys.intern(bracket_data[1])
            )

        self._superscript_pattern = c.get('superscript_regex_pattern',
                                        r"[\u02B0-\u02FF\u1D00-\u1DBF\u2070-\u209F\u1D2C-\u1D6B]")

    def _compile_patterns(self):
        """Compile all regex patterns once (hoisted out of the hot loops)."""
        self.superscript_regex = re.compile(self._superscript_pattern)
        separator_pattern = '|'.join(re.escape(sep) for sep in self.separator_symbols)
        # Capturing groups keep the separators (and surrounding whitespace)
        # in the output of .split() so tokenize_content can re-emit them.
        self.separator_split_pattern = re.compile(rf'(\s*)({separator_pattern})(\s*)')
        # Wikitable: "{|" ... "\n|}", optionally indented with up to four ":".
        self.table_pattern = re.compile(r'(?:^|\n)\s*:{0,4}\s*\{\|.*?\n\|\}', re.MULTILINE | re.DOTALL)
        self.word_pattern = re.compile(r'\b\w+\b')
        self.space_pattern = re.compile(r'(\s+)')

    def _build_char_maps(self):
        """Build a character -> classification map from the config sets.

        Later categories overwrite earlier ones when a character appears in
        several sets (precedence: non_ipa > click > vowel > tone > diacritic).
        NOTE(review): char_types is never consulted anywhere in this file --
        possibly vestigial or kept for external callers; confirm before removing.
        """
        self.char_types = {}
        for char in self.diacritics:
            self.char_types[char] = 'diacritic'
        for char in self.tone_symbols:
            self.char_types[char] = 'tone'
        for char in self.vowel_symbols:
            self.char_types[char] = 'vowel'
        for char in self.click_symbols:
            self.char_types[char] = 'click'
        for char in self.non_ipa_diacritics:
            self.char_types[char] = 'non_ipa'

    def _analyze_segment(self, segment: str) -> Dict[str, Any]:
        """Classify one IPA segment (memoized via ``self.analyze_segment``).

        Returns a dict with boolean flags ('has_tone', 'is_diphthong',
        'has_non_ipa', 'is_ejective_consonant', 'should_link') plus the
        bracket/whitespace-stripped text under 'clean'.
        """
        seg_clean = ''.join(c for c in segment if c not in self.brackets_to_remove and not c.isspace())
        # NFD-decompose so combining diacritics become separate code points.
        normalized = unicodedata.normalize('NFD', seg_clean)

        has_tone = any(c in self.tone_symbols for c in segment) or segment in self.tone_symbols
        has_ejective = self.ejective_marker in segment
        has_diacritic = any(d in normalized for d in self.diacritics) or bool(self.superscript_regex.search(seg_clean))
        has_non_ipa = any(c in self.non_ipa_diacritics for c in normalized)
        has_tiebar = self.tiebar in normalized
        has_click = any(click in normalized for click in self.click_symbols)

        # Count vowels for diphthong detection (diacritics/superscripts removed).
        clean_for_vowels = ''.join(c for c in seg_clean
                                 if c not in self.diacritics and not self.superscript_regex.match(c))
        vowel_count = sum(1 for c in clean_for_vowels.lower() if c in self.vowel_symbols)
        is_diphthong = vowel_count >= 2

        # Check if ejective consonant: a single non-vowel base plus the marker.
        is_ejective_consonant = False
        if has_ejective:
            base = segment.replace(self.ejective_marker, '')
            # BUG FIX: the original chained `... and base.strip()` last, which
            # made this flag a truthy *string* instead of a bool; coerce.
            is_ejective_consonant = bool(base.strip() and
                                         len(base.strip()) == 1 and
                                         base.lower() not in self.vowel_symbols)

        # "Special" segments are linkable even when multi-character.
        is_special = (seg_clean.lower() in self.multi_char_exceptions or
                     has_diacritic or has_tiebar or has_click or
                     is_ejective_consonant or len(seg_clean.strip()) == 1)

        # Link only plain single phones: no tones, no diphthongs, no
        # non-IPA diacritics, and something left after cleaning.
        should_link = (not has_tone and not is_diphthong and not has_non_ipa and
                      seg_clean.strip() and (is_ejective_consonant or is_special))

        return {
            'has_tone': has_tone,
            'is_diphthong': is_diphthong,
            'has_non_ipa': has_non_ipa,
            'is_ejective_consonant': is_ejective_consonant,
            'should_link': should_link,
            'clean': seg_clean
        }

    def detect_ipa_brackets(self, segment: str) -> Tuple[Optional[str], Optional[str], str, Optional[str]]:
        """Strip surrounding brackets from *segment*.

        Returns ``(open, close, inner_content, template_name)`` where
        ``template_name`` is non-None only for configured IPA-specific
        brackets; for plain brackets it is None, and for no brackets the
        result is ``(None, None, segment, None)``.
        """
        segment = segment.strip()

        # IPA-specific brackets take priority over plain ones.
        for open_b, (close_b, template_name) in self.ipa_brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, template_name

        # Fallback to regular brackets.
        for open_b, close_b in self.brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, None

        return None, None, segment, None

    def tokenize_content(self, content: str) -> List:
        """Split *content* into segments, separators, and whitespace runs.

        Returns a mixed list: plain strings (segments or whitespace runs)
        and 3-tuples ``('separator', sep_text, '')`` for separator symbols.
        NOTE(review): the third tuple element is always '' here -- the
        whitespace captured around a separator is dropped by this tokenizer.
        """
        result = []
        parts = self.separator_split_pattern.split(content)

        for part in parts:
            if not part:
                continue
            if part.strip() in self.separator_symbols:
                result.append((_STRINGS['separator'], part.strip(), ''))
            elif part.isspace():
                continue
            else:
                if part.strip():
                    # Preserve internal spacing between sub-segments.
                    space_parts = self.space_pattern.split(part)
                    for space_part in space_parts:
                        if space_part.strip():
                            result.append(space_part.strip())
                        elif space_part.isspace():
                            result.append(space_part)
        return result

    def contains_phonetic_terms(self, text: str, min_terms: int = 3) -> Tuple[bool, List[str]]:
        """Return (found_enough, sample) where found_enough means *text*
        contains at least *min_terms* distinct configured phonetic terms."""
        words = set(self.word_pattern.findall(text.lower()))
        matched = [w for w in words if w in self.phonetic_terms]
        return len(matched) >= min_terms, matched[:min_terms] if matched else []

    def find_tables(self, text: str) -> List[Tuple[int, int, str]]:
        """Return (start, end, content) for every wikitable in *text*."""
        return [(m.start(), m.end(), m.group()) for m in self.table_pattern.finditer(text)]

    def is_in_table(self, pos: int, tables: List[Tuple[int, int, str]]) -> Tuple[bool, Optional[str]]:
        """Return (True, table_text) if *pos* falls inside any table span."""
        for start, end, content in tables:
            # end is a regex match end (exclusive); <= keeps the original,
            # slightly generous boundary behaviour.
            if start <= pos <= end:
                return True, content
        return False, None

    def process_ipa_template(self, node: nodes.Template, parent_list: List, index: int) -> None:
        """Rewrite one ``{{IPA}}`` template node in place inside *parent_list*.

        Dispatches on the template's first argument: bracketed allophone
        sets, simple bracketed phones, or a tokenized fallback. Increments
        ``self.stats.changes`` when a replacement is made.
        """
        if node.name.strip().lower() != _STRINGS['ipa']:
            return

        # ROBUSTNESS FIX: a bare {{IPA}} with no positional parameter would
        # have raised IndexError on node.params[0].
        if not node.params:
            return

        raw_content = str(node.params[0].value).strip()
        open_b, close_b, inner_content, template_name = self.detect_ipa_brackets(raw_content)

        # Handle IPA brackets enclosing separator-delimited allophones.
        if (open_b and close_b and template_name and
            any(sep in inner_content for sep in self.separator_symbols)):
            self._process_bracketed_allophones(node, parent_list, index, raw_content,
                                             open_b, close_b, inner_content, template_name)
            return

        # Handle simple IPA brackets around a single phone.
        if open_b and close_b and template_name:
            self._process_simple_brackets(node, parent_list, index, raw_content,
                                        open_b, close_b, inner_content, template_name)
            return

        # Fallback: tokenize and convert only if something is linkable.
        segments = self.tokenize_content(raw_content)
        if not any(isinstance(s, str) and not s.isspace() and
                  self.analyze_segment(s if isinstance(s, str) else s[1])['should_link']
                  for s in segments):
            return

        new_nodes = self._create_nodes(segments)
        if new_nodes:
            parent_list[index:index+1] = new_nodes
            self.stats.changes += 1
            print(f"Converted IPA template: {raw_content}")

    def _process_simple_brackets(self, node, parent_list, index, raw_content,
                               open_b, close_b, inner_content, template_name):
        """Replace a simple bracketed phone with its linking template."""
        if not inner_content.strip():
            return

        analysis = self.analyze_segment(inner_content)
        if analysis['should_link']:
            ipa_link = nodes.Template(name=template_name)
            ipa_link.add("1", inner_content.strip())
            parent_list[index:index+1] = [ipa_link]
            self.stats.changes += 1
            print(f"Converted: {raw_content} -> {{{{{template_name}|{inner_content.strip()}}}}}")

    def _process_bracketed_allophones(self, node, parent_list, index, raw_content,
                                    open_b, close_b, inner_content, template_name):
        """Replace a bracketed allophone set, linking each linkable segment
        and keeping separators / unlinkable segments as plain text."""
        segments = self.tokenize_content(inner_content)
        new_nodes = []

        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                # ('separator', text, trailing) -- keep as literal text.
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue

            if isinstance(segment, str):
                if segment.isspace():
                    new_nodes.append(nodes.Text(segment))
                    continue

                if not segment.strip():
                    continue

                analysis = self.analyze_segment(segment)
                if analysis['should_link']:
                    ipa_link = nodes.Template(name=template_name)
                    ipa_link.add("1", segment.strip())
                    new_nodes.append(ipa_link)
                else:
                    # Unlinkable segments are preserved verbatim.
                    new_nodes.append(nodes.Text(segment))

        if new_nodes:
            parent_list[index:index+1] = new_nodes
            self.stats.changes += 1
            print(f"Converted allophone template: {raw_content}")

    def _create_nodes(self, segments: List) -> List[nodes.Node]:
        """Build replacement nodes for the tokenized fallback path.

        Returns an empty list (meaning "keep the original template") if any
        non-separator segment should not be linked.
        """
        new_nodes = []

        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue

            if isinstance(segment, str) and segment.isspace():
                new_nodes.append(nodes.Text(segment))
                continue

            # BUG FIX: detect_ipa_brackets returns a 4-tuple; the original
            # unpacked only three names and raised ValueError at runtime.
            open_b, close_b, core, _template = self.detect_ipa_brackets(segment)
            if not core.strip():
                continue

            analysis = self.analyze_segment(core)

            if not analysis['should_link']:
                return []  # Preserve original if any segment shouldn't be linked

            if open_b:
                new_nodes.append(nodes.Text(open_b))

            ipa_link = nodes.Template(name=_STRINGS['IPA link'])
            ipa_link.add("1", core)
            new_nodes.append(ipa_link)

            if close_b:
                new_nodes.append(nodes.Text(close_b))

        return new_nodes

    def process_page(self, page: pywikibot.Page) -> bool:
        """Process one page interactively; return True if it was saved.

        Shows a diff and asks for confirmation before saving.
        """
        print(f"\nProcessing: {page.title()}")

        try:
            text = page.get()
        except pywikibot.exceptions.NoPageError:
            print("Page doesn't exist!")
            return False

        wikicode = parse(text)
        self.stats.changes = 0  # per-page counter, reset each call
        tables = self.find_tables(text)

        self._process_nodes_with_context(wikicode.nodes, 0, tables)

        if self.stats.changes:
            new_text = str(wikicode)
            print(f"\nFound {self.stats.changes} IPA conversion(s)")
            pywikibot.showDiff(text, new_text)

            if input("Save changes? (y/n): ").lower() == 'y':
                page.text = new_text
                page.save(summary=f"IPA conversion in phonetic tables ({self.stats.changes} templates)", bot=True)
                return True
        else:
            print("No IPA templates needed conversion")

        return False

    def _process_nodes_with_context(self, node_list: List[nodes.Node], text_offset: int, tables: List):
        """Walk a node list, converting {{IPA}} templates that sit inside
        phonetic tables; recurses into tag contents.

        NOTE(review): offsets are approximate -- they track pre-replacement
        node lengths, and tag_start_len ignores tag attributes -- which is
        tolerable because is_in_table only needs a rough position.
        """
        i = 0
        current_offset = text_offset

        while i < len(node_list):
            node = node_list[i]
            node_str = str(node)

            in_table, table_content = self.is_in_table(current_offset, tables)

            if isinstance(node, nodes.Template) and node.name.strip().lower() == _STRINGS['ipa']:
                # Only convert inside tables that actually discuss phonetics.
                if in_table:
                    is_relevant, _ = self.contains_phonetic_terms(table_content, 3)
                    if is_relevant:
                        self.process_ipa_template(node, node_list, i)

            elif isinstance(node, nodes.Tag) and hasattr(node, 'contents') and hasattr(node.contents, 'nodes'):
                tag_start_len = len(f"<{node.tag}>")
                self._process_nodes_with_context(node.contents.nodes, current_offset + tag_start_len, tables)

            current_offset += len(node_str)
            i += 1

    def process_category(self, category_name: str, depth: int = 0,
                        max_pages: Optional[int] = None, skip_pages: int = 0) -> Tuple[int, int]:
        """Process every article-namespace page in a category.

        Args:
            category_name: category title without the "Category:" prefix.
            depth: subcategory recursion depth (0 = this category only).
            max_pages: stop after processing this many pages (None = no limit).
            skip_pages: skip this many pages before starting.

        Returns:
            (processed_count, modified_count).
        """
        site = pywikibot.Site('en', 'wikipedia')
        cat = pywikibot.Category(site, f"Category:{category_name}")

        print(f"\n=== Processing Category: {cat.title()} ===")

        all_pages = list(cat.articles(recurse=depth))
        pages = [page for page in all_pages if page.namespace() == 0]

        print(f"Found {len(all_pages)} total pages, {len(pages)} article space pages")
        if max_pages:
            print(f"Will process up to {max_pages} pages")
        if skip_pages:
            print(f"Skipping first {skip_pages} pages")

        self.stats.processed_count = 0
        self.stats.modified_count = 0
        self.stats.skipped_count = 0

        for i, page in enumerate(pages):
            if max_pages and self.stats.processed_count >= max_pages:
                break

            if self.stats.skipped_count < skip_pages:
                self.stats.skipped_count += 1
                continue

            print(f"\n[{i+1}/{len(pages)}] Processing article: {page.title()}")

            try:
                if self.process_page(page):
                    self.stats.modified_count += 1
                self.stats.processed_count += 1
            except Exception as e:
                # Best-effort batch run: log and continue with the next page.
                print(f"Error processing page {page.title()}: {e}")

        print(f"\n=== Category Processing Complete ===")
        print(f"Processed {self.stats.processed_count} pages")
        print(f"Made changes to {self.stats.modified_count} pages")

        return self.stats.processed_count, self.stats.modified_count

    def reload_config(self):
        """Reload the YAML config, recompile patterns, and clear the
        segment-analysis cache so stale classifications are discarded."""
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        # Clear the per-instance memo cache built in __init__.
        self.analyze_segment.cache_clear()
        print(f"Configuration reloaded from {self.config_path}")

def main():
    """Interactive entry point: authenticate, build a processor, run the menu."""
    # URL decoding workaround: some pywikibot builds lack tools.chars, so
    # install a minimal stand-in backed by urllib.
    if not hasattr(pywikibot, 'tools'):
        pywikibot.tools = type('', (), {})()
        pywikibot.tools.chars = type('', (), {})()
        pywikibot.tools.chars.url2string = lambda text, encodings=None: urllib.parse.unquote(
            text, encoding=(encodings or ['utf-8'])[0]
        )

    try:
        site = pywikibot.Site('en', 'wikipedia')

        # Guard clause: bail out early when not authenticated.
        if not site.logged_in():
            print("Not logged in. Please check your authentication.")
            return
        print(f"Successfully logged in as: {site.username()}")

        chosen_path = input("Enter config file path (or press Enter for default 'ipa_config.yaml'): ").strip()
        processor = IPAProcessor(chosen_path or "ipa_config.yaml")

        while True:
            print("\nOptions:")
            print("1. Process a specific page")
            print("2. Process a category")
            print("3. Reload configuration")
            print("4. Exit")

            selection = input("Enter your choice (1-4): ").strip()

            if selection == '4':
                print("Exiting program.")
                break

            if selection == '1':
                title = input("Enter page title: ").strip()
                processor.process_page(pywikibot.Page(site, title))

            elif selection == '2':
                name = input("Enter category name (without 'Category:' prefix): ").strip()
                recursion = int(input("Enter recursion depth (0 for just this category): ").strip() or "0")
                limit_text = input("Enter maximum pages (or enter for no limit): ").strip()
                limit = int(limit_text) if limit_text else None
                offset = int(input("Enter pages to skip (or enter for none): ").strip() or "0")

                processor.process_category(name, recursion, limit, offset)

            elif selection == '3':
                processor.reload_config()

            else:
                print("Invalid choice. Please enter 1, 2, 3, or 4.")

    except KeyboardInterrupt:
        print("\nOperation interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()