-- 模組:Conversion rule extractor/Matcher

-- Module:Conversion_rule_extractor/Matcher
-- Submodule: matches conversion rules against the content of a target page.

-- Module table; the functions attached to it below are what this file returns.
local Matcher = {}

--- Extract the source texts that a conversion rule should be matched against.
--
-- The rule string is split on ';' and every non-empty part contributes at most
-- one source, chosen by the first form that applies:
--   1. `A=>...`       one-way rule: the text before `=>`
--   2. `xx-yy:Text`   variant-tagged rule: the text after the variant code
--   3. `Text:Variant` non-standard form: the text before the first ':'
--   4. bare text      a part containing neither '=' nor ':' is used whole
--
-- Examples:
--   'zh-cn:图尔库;zh-tw:土庫;' -> { "图尔库", "土庫" }
--   '巨集=>zh-cn:宏;'          -> { "巨集" }
--
-- @param ruleString string  rule text, optionally still wrapped in `-{ ... }-`
-- @return table  sequence of unique, trimmed source strings in order of first
--                appearance
local function extractRuleSources(ruleString)
    local sources = {}
    local seen = {} -- source -> true, used for de-duplication

    -- Strip an outer `-{ ... }-` wrapper if one is still present. When the
    -- wrapper carries flags, everything up to the first '|' is dropped too.
    -- A wrapper without flags has to be stripped as well, otherwise the
    -- `-{` / `}-` markers end up inside the extracted sources.
    local inner = ruleString:match('^%-{(.*)}%-$')
    if inner then
        ruleString = inner:match('^.-|(.*)$') or inner
    end

    for part in mw.text.gsplit(ruleString, ';') do
        part = mw.text.trim(part)
        if part ~= '' then
            -- Try the forms in priority order; a later pattern is only
            -- evaluated when the earlier ones did not match.
            local source = part:match('^([^=]-)=>')   -- A=>B
                or part:match('^%w+%-%w+:(.+)')       -- xx-yy:Text
                or part:match('^([^:]+):.+')          -- Text:Variant

            -- Fallback for plain text such as "單純文字": only when the part
            -- contains neither '=' nor ':' is the whole part taken as source.
            if not source
                and not part:find('=', 1, true)
                and not part:find(':', 1, true) then
                source = part
            end

            if source then
                source = mw.text.trim(source)
                if source ~= '' and not seen[source] then
                    seen[source] = true
                    sources[#sources + 1] = source
                end
            end
        end
    end

    return sources
end


--- Build a character trie over the source texts of all rules
-- (adapted from Module:NoteTA-lite).
--
-- @param rulesList table  sequence of rule strings
-- @return table  trie root. Each node maps one UTF-8 character to a child
--                node; a node at which a source text ends additionally has an
--                `indices` field holding the positions in `rulesList` of the
--                rules that produced that source.
function Matcher.buildRuleTrie(rulesList)
    -- Pass 1: group rule positions by the source text they contain.
    local positionsBySource = {}
    for ruleIndex, ruleString in ipairs(rulesList) do
        for _, sourceText in ipairs(extractRuleSources(ruleString)) do
            local bucket = positionsBySource[sourceText]
            if bucket == nil then
                bucket = {}
                positionsBySource[sourceText] = bucket
            end
            bucket[#bucket + 1] = ruleIndex
        end
    end

    -- Pass 2: lay down one path per distinct source text.
    local root = {}
    for sourceText, bucket in pairs(positionsBySource) do
        local node = root
        -- mw.ustring is used so that the walk is per UTF-8 character.
        for pos = 1, mw.ustring.len(sourceText) do
            local ch = mw.ustring.sub(sourceText, pos, pos)
            local child = node[ch]
            if not child then
                child = {}
                node[ch] = child
            end
            node = child
        end
        -- Mark the end of this source with the rules it belongs to.
        node.indices = bucket
    end

    return root
end

--- Find every rule whose source text occurs somewhere in `text`
-- (adapted from Module:NoteTA-lite).
--
-- Every start position in the text is tried; from each one the trie is
-- followed as far as the text allows, and every node carrying an `indices`
-- field on the way records its rule indices (so both shorter and longer
-- sources starting at the same position are reported).
--
-- @param text string|nil  text to search; nil or '' yields an empty result
-- @param trie table       trie as produced by Matcher.buildRuleTrie
-- @return table  set of matched rule indices: { [index] = true, ... }
function Matcher.matchTextWithTrie(text, trie)
    local matchedIndices = {}
    if not text or text == '' then return matchedIndices end

    -- Split the text into UTF-8 characters once, up front. Fetching the j-th
    -- character with mw.ustring.sub inside the nested loop below has to walk
    -- the string on every call, which makes scanning a whole page far more
    -- expensive than necessary. The pattern takes one non-continuation byte
    -- plus its continuation bytes (0x80-0xBF), i.e. one encoded character for
    -- well-formed UTF-8, so the keys agree with the per-character trie keys.
    local chars = {}
    local count = 0
    for ch in string.gmatch(text, '[^\128-\191][\128-\191]*') do
        count = count + 1
        chars[count] = ch
    end

    for start = 1, count do
        local node = trie
        for pos = start, count do
            node = node[chars[pos]]
            if not node then
                break -- no source text continues with this character
            end
            -- A source text ends here: record its rules, then keep going so
            -- that longer sources sharing this prefix are found as well.
            local indices = node.indices
            if indices then
                for _, index in ipairs(indices) do
                    matchedIndices[index] = true
                end
            end
        end
    end

    return matchedIndices
end

--- Keep only the rules whose source text occurs in the target content.
--
-- The target may be:
--   * a string naming an existing page: that page's content is searched;
--   * any other string: the string itself is searched as literal text;
--   * a title-like table exposing `getContent`: its content is searched.
-- Note that a string which happens to equal the title of an existing page is
-- always treated as a title, never as literal text.
--
-- @param rulesList table|nil       sequence of rule strings
-- @param targetPageTitleOrText any page title, literal text, or title object
-- @return table  sequence of the rule strings that matched, in their original
--                order; empty when there are no rules, the target is not
--                usable, or no content could be obtained
function Matcher.filterRules(rulesList, targetPageTitleOrText)
    local filteredRules = {}
    if not rulesList or #rulesList == 0 then
        return filteredRules
    end

    local textContent
    local targetType = type(targetPageTitleOrText)
    if targetType == 'string' then
        -- Decide between "page title" and "literal text".
        local titleObj = mw.title.new(targetPageTitleOrText)
        if titleObj and titleObj.exists then
            textContent = titleObj:getContent()
        else
            textContent = targetPageTitleOrText
        end
    elseif targetType == 'table' and targetPageTitleOrText.getContent then
        -- A title object was passed in directly.
        textContent = targetPageTitleOrText:getContent()
    else
        -- nil, numbers, booleans, or tables without getContent: nothing to
        -- search. The explicit table check avoids an "attempt to index"
        -- error for non-table values.
        return filteredRules
    end

    if not textContent or textContent == '' then
        return filteredRules -- no content to match against
    end

    local trie = Matcher.buildRuleTrie(rulesList)
    local matchedIndices = Matcher.matchTextWithTrie(textContent, trie)

    for index, rule in ipairs(rulesList) do
        if matchedIndices[index] then
            filteredRules[#filteredRules + 1] = rule
        end
    end

    return filteredRules
end

--- Keep only the rules whose source text occurs in a page's title text.
--
-- Only the `text` field of the title (the title without its namespace) is
-- searched; the page content is never loaded.
--
-- @param rulesList table|nil  sequence of rule strings
-- @param pageTitle any        page title (string), a value accepted by
--                             mw.title.new, or an existing title object
--                             (accepted for parity with Matcher.filterRules)
-- @return table  sequence of the rule strings that matched, in their original
--                order; empty when there are no rules or no usable title
function Matcher.filterRulesAgainstTitleText(rulesList, pageTitle)
    local filteredRules = {}
    if not rulesList or #rulesList == 0 then
        return filteredRules
    end

    local titleObj
    local argType = type(pageTitle)
    if argType == 'table' then
        -- Already a title object: use it as-is instead of re-parsing it.
        titleObj = pageTitle
    elseif argType == 'string' or argType == 'number' then
        titleObj = mw.title.new(pageTitle)
    end
    if not titleObj then
        -- Invalid title, nil, or an unsupported argument type.
        return filteredRules
    end

    local titleText = titleObj.text
    if type(titleText) ~= 'string' or titleText == '' then
        return filteredRules
    end

    local trie = Matcher.buildRuleTrie(rulesList)
    local matchedIndices = Matcher.matchTextWithTrie(titleText, trie)

    for index, rule in ipairs(rulesList) do
        if matchedIndices[index] then
            filteredRules[#filteredRules + 1] = rule
        end
    end

    return filteredRules
end


-- Export the module table.
return Matcher
--[[
模組:Conversion rule extractor/Matcher

公共函数

filterRules

filterRulesAgainstTitleText

内部函数

extractRuleSources

buildRuleTrie

matchTextWithTrie
]]