User:PharyngealImplosive7/IPAlinker.py

The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
Description

This contains old code for PharyngealBOT task 2. It is NOT the current code and is retained for historical reasons. To see that, go to the github repository. – PharyngealImplosive7 (talk) 00:00, 1 June 2025 (UTC)
Actual code

import pywikibot
from mwparserfromhell import parse, nodes
import urllib.parse
import re
import unicodedata
from dataclasses import dataclass
from typing import Set, List, Tuple, Optional, Dict, Any
from functools import lru_cache
import yaml
import sys
from pathlib import Path

# Pre-intern common strings
_STRINGS = {s: sys.intern(s) for s in ['ipa', 'separator', 'IPA link']}

@dataclass
class ProcessingStats:
    __slots__ = ['changes', 'processed_count', 'modified_count', 'skipped_count']
    
    def __init__(self):
        self.changes = 0
        self.processed_count = 0
        self.modified_count = 0
        self.skipped_count = 0

class IPAProcessor:
    """Optimized IPA processor with combined functionality."""
    
    def __init__(self, config_path: str = "ipa_config.yaml"):
        self.config_path = config_path
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        self.stats = ProcessingStats()
        print(f"IPA Processor initialized with config: {config_path}")
    
    def _load_config(self):
        """Load and cache configuration."""
        config_file = Path(self.config_path)
        if not config_file.exists():
            raise FileNotFoundError(f"Configuration file not found: {self.config_path}")
        
        with open(config_file, 'r', encoding='utf-8') as f:
            c = yaml.safe_load(f)
        
        # Store config values directly as attributes for faster access
        self.max_ipa_length = c.get('max_ipa_length', 1)
        self.brackets = c.get('brackets', {})
        self.brackets_to_remove = frozenset(c.get('brackets_to_remove', []))
        self.multi_char_exceptions = frozenset(c.get('multi_char_exceptions', []))
        self.vowel_symbols = frozenset(c.get('vowel_symbols', []))
        self.tone_symbols = frozenset(c.get('tone_symbols', []))
        self.click_symbols = frozenset(c.get('click_symbols', []))
        self.diacritics = frozenset(c.get('diacritics', []))
        self.non_ipa_diacritics = frozenset(c.get('non_ipa_diacritics', []))
        self.allophone_separators = frozenset(c.get('allophone_separators', []))
        self.separator_symbols = frozenset(c.get('separator_symbols', []))
        self.phonetic_terms = frozenset(term.lower() for term in c.get('phonetic_terms', []))
        self.tiebar = sys.intern(c.get('tiebar', '͡'))
        self.ejective_marker = sys.intern(c.get('ejective_marker', 'ʼ'))
        
        # Process IPA brackets
        ipa_brackets_raw = c.get('ipa_brackets', {})
        self.ipa_brackets = {}
        for open_b, bracket_data in ipa_brackets_raw.items():
            self.ipa_brackets[sys.intern(open_b)] = (
                sys.intern(bracket_data[0]), 
                sys.intern(bracket_data[1])
            )
        
        self._superscript_pattern = c.get('superscript_regex_pattern', 
                                        r"[\u02B0-\u02FF\u1D00-\u1DBF\u2070-\u209F\u1D2C-\u1D6B]")
    
    def _compile_patterns(self):
        """Compile all regex patterns once."""
        self.superscript_regex = re.compile(self._superscript_pattern)
        separator_pattern = '|'.join(re.escape(sep) for sep in self.separator_symbols)
        self.separator_split_pattern = re.compile(rf'(\s*)({separator_pattern})(\s*)')
        self.table_pattern = re.compile(r'(?:^|\n)\s*:{0,4}\s*\{\|.*?\n\|\}', re.MULTILINE | re.DOTALL)
        self.word_pattern = re.compile(r'\b\w+\b')
        self.space_pattern = re.compile(r'(\s+)')
    
    def _build_char_maps(self):
        """Build efficient character classification maps."""
        self.char_types = {}
        for char in self.diacritics:
            self.char_types[char] = 'diacritic'
        for char in self.tone_symbols:
            self.char_types[char] = 'tone'
        for char in self.vowel_symbols:
            self.char_types[char] = 'vowel'
        for char in self.click_symbols:
            self.char_types[char] = 'click'
        for char in self.non_ipa_diacritics:
            self.char_types[char] = 'non_ipa'
    
    @lru_cache(maxsize=1000)
    def analyze_segment(self, segment: str) -> Dict[str, Any]:
        """Single-pass segment analysis with caching."""
        seg_clean = ''.join(c for c in segment if c not in self.brackets_to_remove and not c.isspace())
        normalized = unicodedata.normalize('NFD', seg_clean)
        
        has_tone = any(c in self.tone_symbols for c in segment) or segment in self.tone_symbols
        has_ejective = self.ejective_marker in segment
        has_diacritic = any(d in normalized for d in self.diacritics) or bool(self.superscript_regex.search(seg_clean))
        has_non_ipa = any(c in self.non_ipa_diacritics for c in normalized)
        has_tiebar = self.tiebar in normalized
        has_click = any(click in normalized for click in self.click_symbols)
        
        # Count vowels for diphthong detection
        clean_for_vowels = ''.join(c for c in seg_clean 
                                 if c not in self.diacritics and not self.superscript_regex.match(c))
        vowel_count = sum(1 for c in clean_for_vowels.lower() if c in self.vowel_symbols)
        is_diphthong = vowel_count >= 2
        
        # Check if ejective consonant
        is_ejective_consonant = False
        if has_ejective:
            base = segment.replace(self.ejective_marker, '')
            is_ejective_consonant = (len(base.strip()) == 1 and 
                                   base.lower() not in self.vowel_symbols and base.strip())
        
        # Determine if special segment
        is_special = (seg_clean.lower() in self.multi_char_exceptions or
                     has_diacritic or has_tiebar or has_click or
                     is_ejective_consonant or len(seg_clean.strip()) == 1)
        
        # Should link determination
        should_link = (not has_tone and not is_diphthong and not has_non_ipa and
                      seg_clean.strip() and (is_ejective_consonant or is_special))
        
        return {
            'has_tone': has_tone,
            'is_diphthong': is_diphthong,
            'has_non_ipa': has_non_ipa,
            'is_ejective_consonant': is_ejective_consonant,
            'should_link': should_link,
            'clean': seg_clean
        }
    
    def detect_ipa_brackets(self, segment: str) -> Tuple[Optional[str], Optional[str], str, Optional[str]]:
        """Fast bracket detection."""
        segment = segment.strip()
        
        # Check IPA-specific brackets first
        for open_b, (close_b, template_name) in self.ipa_brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, template_name
        
        # Fallback to regular brackets
        for open_b, close_b in self.brackets.items():
            if segment.startswith(open_b) and segment.endswith(close_b):
                content = segment[len(open_b):-len(close_b)].strip()
                return open_b, close_b, content, None
        
        return None, None, segment, None
    
    def tokenize_content(self, content: str) -> List:
        """Fast tokenization with single regex split."""
        result = []
        parts = self.separator_split_pattern.split(content)
        
        for part in parts:
            if not part:
                continue
            if part.strip() in self.separator_symbols:
                result.append((_STRINGS['separator'], part.strip(), ''))
            elif part.isspace():
                continue
            else:
                if part.strip():
                    space_parts = self.space_pattern.split(part)
                    for space_part in space_parts:
                        if space_part.strip():
                            result.append(space_part.strip())
                        elif space_part.isspace():
                            result.append(space_part)
        return result
    
    def contains_phonetic_terms(self, text: str, min_terms: int = 3) -> Tuple[bool, List[str]]:
        """Fast phonetic term detection."""
        words = set(self.word_pattern.findall(text.lower()))
        matched = [w for w in words if w in self.phonetic_terms]
        return len(matched) >= min_terms, matched[:min_terms] if matched else []
    
    def find_tables(self, text: str) -> List[Tuple[int, int, str]]:
        """Find table boundaries."""
        return [(m.start(), m.end(), m.group()) for m in self.table_pattern.finditer(text)]
    
    def is_in_table(self, pos: int, tables: List[Tuple[int, int, str]]) -> Tuple[bool, Optional[str]]:
        """Check if position is in any table."""
        for start, end, content in tables:
            if start <= pos <= end:
                return True, content
        return False, None
    
    def process_ipa_template(self, node: nodes.Template, parent_list: List, index: int) -> None:
        """Process IPA template with optimized logic."""
        if node.name.strip().lower() != _STRINGS['ipa']:
            return
        
        raw_content = str(node.params[0].value).strip()
        open_b, close_b, inner_content, template_name = self.detect_ipa_brackets(raw_content)
        
        # Handle IPA brackets with separators
        if (open_b and close_b and template_name and 
            any(sep in inner_content for sep in self.separator_symbols)):
            self._process_bracketed_allophones(node, parent_list, index, raw_content,
                                             open_b, close_b, inner_content, template_name)
            return
        
        # Handle simple IPA brackets
        if open_b and close_b and template_name:
            self._process_simple_brackets(node, parent_list, index, raw_content,
                                        open_b, close_b, inner_content, template_name)
            return
        
        # Fallback processing
        segments = self.tokenize_content(raw_content)
        if not any(isinstance(s, str) and not s.isspace() and 
                  self.analyze_segment(s if isinstance(s, str) else s[1])['should_link'] 
                  for s in segments):
            return
        
        new_nodes = self._create_nodes(segments)
        if new_nodes:
            parent_list[index:index+1] = new_nodes
            self.stats.changes += 1
            print(f"Converted IPA template: {raw_content}")
    
    def _process_simple_brackets(self, node, parent_list, index, raw_content, 
                               open_b, close_b, inner_content, template_name):
        """Handle simple IPA brackets."""
        if not inner_content.strip():
            return
        
        analysis = self.analyze_segment(inner_content)
        if analysis['should_link']:
            ipa_link = nodes.Template(name=template_name)
            ipa_link.add("1", inner_content.strip())
            parent_list[index:index+1] = [ipa_link]
            self.stats.changes += 1
            print(f"Converted: {raw_content} -> {{{{{template_name}|{inner_content.strip()}}}}}")
    
    def _process_bracketed_allophones(self, node, parent_list, index, raw_content,
                                    open_b, close_b, inner_content, template_name):
        """Handle bracketed allophone sets."""
        segments = self.tokenize_content(inner_content)
        new_nodes = []
        
        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue
            
            if isinstance(segment, str):
                if segment.isspace():
                    new_nodes.append(nodes.Text(segment))
                    continue
                
                if not segment.strip():
                    continue
                
                analysis = self.analyze_segment(segment)
                if analysis['should_link']:
                    ipa_link = nodes.Template(name=template_name)
                    ipa_link.add("1", segment.strip())
                    new_nodes.append(ipa_link)
                else:
                    new_nodes.append(nodes.Text(segment))
        
        if new_nodes:
            parent_list[index:index+1] = new_nodes
            self.stats.changes += 1
            print(f"Converted allophone template: {raw_content}")
    
    def _create_nodes(self, segments: List) -> List[nodes.Node]:
        """Create processed nodes efficiently."""
        new_nodes = []
        
        for segment in segments:
            if isinstance(segment, tuple) and len(segment) == 3:
                new_nodes.append(nodes.Text(segment[1]))
                if segment[2]:
                    new_nodes.append(nodes.Text(segment[2]))
                continue
            
            if isinstance(segment, str) and segment.isspace():
                new_nodes.append(nodes.Text(segment))
                continue
            
            open_b, close_b, core = self.detect_ipa_brackets(segment)
            if not core.strip():
                continue
            
            analysis = self.analyze_segment(core)
            
            if not analysis['should_link']:
                return []  # Preserve original if any segment shouldn't be linked
            
            if open_b:
                new_nodes.append(nodes.Text(open_b))
            
            ipa_link = nodes.Template(name=_STRINGS['IPA link'])
            ipa_link.add("1", core)
            new_nodes.append(ipa_link)
            
            if close_b:
                new_nodes.append(nodes.Text(close_b))
        
        return new_nodes
    
    def process_page(self, page: pywikibot.Page) -> bool:
        """Process single page efficiently."""
        print(f"\nProcessing: {page.title()}")
        
        try:
            text = page.get()
        except pywikibot.exceptions.NoPageError:
            print("Page doesn't exist!")
            return False
        
        wikicode = parse(text)
        self.stats.changes = 0
        tables = self.find_tables(text)
        
        self._process_nodes_with_context(wikicode.nodes, 0, tables)
        
        if self.stats.changes:
            new_text = str(wikicode)
            print(f"\nFound {self.stats.changes} IPA conversion(s)")
            pywikibot.showDiff(text, new_text)
            
            if input("Save changes? (y/n): ").lower() == 'y':
                page.text = new_text
                page.save(summary=f"IPA conversion in phonetic tables ({self.stats.changes} templates)", bot=True)
                return True
        else:
            print("No IPA templates needed conversion")
        
        return False
    
    def _process_nodes_with_context(self, node_list: List[nodes.Node], text_offset: int, tables: List):
        """Process nodes with table context."""
        i = 0
        current_offset = text_offset
        
        while i < len(node_list):
            node = node_list[i]
            node_str = str(node)
            
            in_table, table_content = self.is_in_table(current_offset, tables)
            
            if isinstance(node, nodes.Template) and node.name.strip().lower() == _STRINGS['ipa']:
                if in_table:
                    is_relevant, _ = self.contains_phonetic_terms(table_content, 3)
                    if is_relevant:
                        self.process_ipa_template(node, node_list, i)
            
            elif isinstance(node, nodes.Tag) and hasattr(node, 'contents') and hasattr(node.contents, 'nodes'):
                tag_start_len = len(f"<{node.tag}>")
                self._process_nodes_with_context(node.contents.nodes, current_offset + tag_start_len, tables)
            
            current_offset += len(node_str)
            i += 1
    
    def process_category(self, category_name: str, depth: int = 0, 
                        max_pages: Optional[int] = None, skip_pages: int = 0) -> Tuple[int, int]:
        """Process category efficiently."""
        site = pywikibot.Site('en', 'wikipedia')
        cat = pywikibot.Category(site, f"Category:{category_name}")
        
        print(f"\n=== Processing Category: {cat.title()} ===")
        
        all_pages = list(cat.articles(recurse=depth))
        pages = [page for page in all_pages if page.namespace() == 0]
        
        print(f"Found {len(all_pages)} total pages, {len(pages)} article space pages")
        if max_pages:
            print(f"Will process up to {max_pages} pages")
        if skip_pages:
            print(f"Skipping first {skip_pages} pages")
        
        self.stats.processed_count = 0
        self.stats.modified_count = 0
        self.stats.skipped_count = 0
        
        for i, page in enumerate(pages):
            if max_pages and self.stats.processed_count >= max_pages:
                break
            
            if self.stats.skipped_count < skip_pages:
                self.stats.skipped_count += 1
                continue
            
            print(f"\n[{i+1}/{len(pages)}] Processing article: {page.title()}")
            
            try:
                if self.process_page(page):
                    self.stats.modified_count += 1
                self.stats.processed_count += 1
            except Exception as e:
                print(f"Error processing page {page.title()}: {e}")
        
        print(f"\n=== Category Processing Complete ===")
        print(f"Processed {self.stats.processed_count} pages")
        print(f"Made changes to {self.stats.modified_count} pages")
        
        return self.stats.processed_count, self.stats.modified_count
    
    def reload_config(self):
        """Reload configuration."""
        self._load_config()
        self._compile_patterns()
        self._build_char_maps()
        # Clear cache
        self.analyze_segment.cache_clear()
        print(f"Configuration reloaded from {self.config_path}")

def main():
    """Main entry point."""
    # URL decoding workaround
    if not hasattr(pywikibot, 'tools'):
        pywikibot.tools = type('', (), {})()
        pywikibot.tools.chars = type('', (), {})()
        pywikibot.tools.chars.url2string = lambda text, encodings=None: urllib.parse.unquote(
            text, encoding=(encodings or ['utf-8'])[0]
        )
    
    try:
        site = pywikibot.Site('en', 'wikipedia')
        
        if site.logged_in():
            print(f"Successfully logged in as: {site.username()}")
        else:
            print("Not logged in. Please check your authentication.")
            return
        
        config_path = input("Enter config file path (or press Enter for default 'ipa_config.yaml'): ").strip()
        processor = IPAProcessor(config_path or "ipa_config.yaml")
        
        while True:
            print("\nOptions:")
            print("1. Process a specific page")
            print("2. Process a category") 
            print("3. Reload configuration")
            print("4. Exit")
            
            choice = input("Enter your choice (1-4): ").strip()
            
            if choice == '1':
                page_title = input("Enter page title: ").strip()
                page = pywikibot.Page(site, page_title)
                processor.process_page(page)
            
            elif choice == '2':
                category_name = input("Enter category name (without 'Category:' prefix): ").strip()
                depth = int(input("Enter recursion depth (0 for just this category): ").strip() or "0")
                max_pages_str = input("Enter maximum pages (or enter for no limit): ").strip()
                max_pages = int(max_pages_str) if max_pages_str else None
                skip_pages = int(input("Enter pages to skip (or enter for none): ").strip() or "0")
                
                processor.process_category(category_name, depth, max_pages, skip_pages)
            
            elif choice == '3':
                processor.reload_config()
            
            elif choice == '4':
                print("Exiting program.")
                break
            
            else:
                print("Invalid choice. Please enter 1, 2, 3, or 4.")
    
    except KeyboardInterrupt:
        print("\nOperation interrupted by user.")
    except Exception as e:
        print(f"An error occurred: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    main()