模組:Conversion rule extractor

模块文档[查看] [编辑] [历史] [清除缓存]

此模块已评为alpha版，可接受第三方输入，并可用于少量页面以检查是否存在问题，但需要受到检查。欢迎提供新功能或修改其输入输出机制的建议。

此模块使用Lua语言：

本模块用于提取指定页面的字词转换规则（包括手动规则和公共转换组规则），并根据需要筛选出能在特定目标页面或标题上匹配使用的规则，最终以不同格式输出。

子模块

Module:Conversion_rule_extractor/Extractor – 用于提取规则
Module:Conversion_rule_extractor/Matcher – 用于匹配规则

函数

getRules

获取指定来源页面的所有内容转换规则（手动+公共组），并筛选出能在目标页面内容中匹配到的规则。

调用方式： {{#invoke:Conversion_rule_extractor|getRules|from=来源页面|to=目标页面|flag=输出格式}}

参数：

from 或匿名参数 1: (必须) 字符串，规则来源页面的标题。
to: (可选) 字符串，用于匹配规则的目标页面标题。默认为调用模块的当前页面。
flag: (可选) 字符串，指定输出规则的旗标。
- H (默认): 输出为-{H|规则;}-格式。
- raw: 直接输出规则文本，每条规则占一行。
- 其他值 (如 A, D): 输出为-{值|规则;}-格式。

返回值： 字符串，格式化后的、能在目标页面匹配到的内容转换规则。如果来源页不存在或无匹配规则，返回空字符串。如果缺少 from 参数，返回错误提示信息。

getTitleRule

获取应用于指定来源页面标题的转换规则。

如果来源页面有显式的标题 (T) 规则，则使用该规则。
如果没有显式标题规则，则尝试将来源页面的所有内容规则与其标题文本进行匹配，使用匹配到的规则。

调用方式： {{#invoke:Conversion_rule_extractor|getTitleRule|from=来源页面|flag=输出格式|type=输出类型}}

参数：

from 或匿名参数 1: (必须) 字符串，规则来源页面的标题。
flag: (可选) 字符串，指定输出规则的旗标（当type不为context时生效）。
- H (默认): 输出为-{H|规则;}-格式。
- raw: 直接输出规则文本，每条规则占一行。
- 其他值: 输出为-{值|规则;}-格式。如果规则来源于内容规则匹配，即使指定flag='T'，也会强制使用H。
type: (可选) 字符串，指定输出类型。
- context: (特殊模式) 如果存在显式T规则，输出为-{T|规则;}-。如果不存在显式T规则，但有内容规则匹配标题，输出为-{H|匹配的内容规则;}-原始标题。
- 其他值或省略: 按flag参数指定的格式输出规则本身。

返回值： 字符串，格式化后的标题转换规则或带规则的标题文本。如果来源页不存在或无适用规则，返回空字符串。如果缺少 from 参数，返回错误提示信息。

参见

{{NoteTA-lite}}

上述文档嵌入自Module:Conversion rule extractor/doc。
编者可以在本模块的沙盒和测试样例页面进行实验。
本模块的子页面。

-- Module:ConversionRuleExtractor
-- Author: [Your Username]
-- Date: 2023-10-27 (修订 2023-10-30, 修正标题创建/类型错误, getFullText日志, raw换行)
-- Description: 从指定页面提取全文或标题字词转换规则，并按要求格式化输出。
-- Requires: Module:Template parameter value, Module:Arguments, Module:CGroup/*, mw.title, mw.loadData, mw.text

local p = {}
local TPV = require('Module:Template parameter value')
local Arguments = require('Module:Arguments') -- 用于模板入口点
local mw_text = require('mw.text') -- 用于分割和裁剪

-- Names of the NoteTA template and its redirects. This list is passed to
-- TPV.getParameter as the set of templates to look for on the source page.
local NOTE_TA_TEMPLATES = {
    'NoteTA', 'TA', 'NoteAT', 'NoteTA/default', 'NOTETA', 'Note TA', 'Noteta', 'NoteTa', 'NoteTA/lua', '全文字词转换',
    'NoteTA-lite', 'TA-lite', 'TAL', 'TAl'
}

-- Upper bounds for the parameter probing loops: numbered parameters
-- 1..MAX_LOCAL_RULES and group parameters G1..G<MAX_GROUP_RULES>.
local MAX_LOCAL_RULES = 30
local MAX_GROUP_RULES = 30

--[[--------------------------< 辅助函数 - CGroup 处理 >--------------------------]]
--- Load the data table of a public conversion group.
-- Looks up `Module:CGroup/<groupName>` and loads it through mw.loadData.
-- @param groupName string: group name appended to 'Module:CGroup/'.
-- @return the loaded table, or nil when the module title cannot be created,
--   the module does not exist, loading raises an error, or the loaded value
--   is not a table (the last two cases are logged).
local function loadCGroupData(groupName)
    local modulePath = 'Module:CGroup/' .. groupName
    local moduleTitle = mw.title.new(modulePath)

    -- A missing or invalid module is not worth a log entry: just give up.
    if not (moduleTitle and moduleTitle.exists) then
        return nil
    end

    local ok, loaded = pcall(mw.loadData, modulePath)
    if ok and type(loaded) == 'table' then
        return loaded
    end

    -- Distinguish "module ran but returned something odd" from "module failed".
    local reason
    if ok then
        reason = ' (invalid data type)'
    else
        reason = ' (load error)'
    end
    mw.log('ConversionRuleExtractor: Failed to load or parse CGroup module: ' .. modulePath .. reason)
    return nil
end

--- Collect the rule strings stored in a conversion group's data table.
-- Only entries of `data.content` that are tables with `type == 'item'` and a
-- non-empty string `rule` contribute; everything else is ignored.
-- @param data table|nil: loaded group data; expected to carry a `content` list.
-- @return table: list of rule strings in their original order (possibly empty).
local function extractRulesFromCGroupData(data)
    local collected = {}

    local entries = data and data.content
    if not entries or type(entries) ~= 'table' then
        return collected
    end

    for _, entry in ipairs(entries) do
        if type(entry) == 'table' and entry.type == 'item' then
            local ruleText = entry.rule
            if type(ruleText) == 'string' and ruleText ~= '' then
                collected[#collected + 1] = ruleText
            end
        end
    end

    return collected
end

--[[--------------------------< 辅助函数 - 规则解析与过滤 >--------------------------]]
--- Remove strip markers and HTML-style tags from a term, then trim it.
-- The Scribunto text library has no `strip` function, so the clean-up is
-- done here with mw.text.killMarkers plus a tag-removing gsub.
local function stripTermMarkup(text)
    local cleaned = mw_text.killMarkers(text)
    cleaned = cleaned:gsub('<[^>]->', '')
    return mw_text.trim(cleaned)
end

--- Extract the terms a conversion rule can match in source text.
-- The rule is split on ';'. For each non-empty segment:
--   * `source=>variant:target` (one-way) contributes `source`;
--   * `variant:text` (two-way) contributes `text`;
--   * a segment without ':' or '=>' contributes itself.
-- Every segment is inspected on its own, so a rule holding several one-way
-- conversions yields each of their sources. Duplicate terms are returned once.
-- @param ruleString string: raw rule text; any non-string yields an empty list.
-- @return table: list of cleaned, non-empty source terms.
local function extractSourceTerms(ruleString)
    local sources = {}
    if type(ruleString) ~= 'string' then
        return sources
    end

    local seen = {}
    for segment in mw_text.gsplit(ruleString, ';') do
        segment = mw_text.trim(segment)
        if segment ~= '' then
            local term
            local arrowPos = string.find(segment, '=>', 1, true)
            if arrowPos then
                -- One-way rule: only the left-hand side occurs in source text.
                term = string.sub(segment, 1, arrowPos - 1)
            else
                local colonPos = string.find(segment, ':', 1, true)
                if colonPos then
                    term = string.sub(segment, colonPos + 1)
                else
                    term = segment
                end
            end

            term = stripTermMarkup(term)
            if term ~= '' and not seen[term] then
                seen[term] = true
                sources[#sources + 1] = term
            end
        end
    end
    return sources
end

--- Keep only the rules that have at least one source term occurring in a text.
-- The text is cleaned once (strip markers and HTML-style tags removed, then
-- trimmed); the Scribunto text library has no `strip` function, so the
-- clean-up uses mw.text.killMarkers plus a gsub. Terms are compared with a
-- plain (non-pattern) substring search.
-- @param groupRules table|nil: list of rule strings.
-- @param textToMatch string|nil: text the rules are tested against.
-- @return table: the matching rule strings in their original order; empty
--   when either argument is missing/empty or nothing matches.
local function filterGroupRulesByText(groupRules, textToMatch)
    if type(textToMatch) ~= 'string' or textToMatch == '' then
        return {}
    end
    if type(groupRules) ~= 'table' or #groupRules == 0 then
        return {}
    end

    local haystack = mw_text.killMarkers(textToMatch)
    haystack = haystack:gsub('<[^>]->', '')
    haystack = mw_text.trim(haystack)
    if haystack == '' then
        return {}
    end

    local filteredRules = {}
    for _, ruleString in ipairs(groupRules) do
        for _, term in ipairs(extractSourceTerms(ruleString)) do
            -- Fourth argument `true`: terms are literal text, not Lua patterns.
            if string.find(haystack, term, 1, true) then
                filteredRules[#filteredRules + 1] = ruleString
                break
            end
        end
    end
    return filteredRules
end

--[[--------------------------< 辅助函数 - 格式化输出 >--------------------------]]
--- Render a list of rules as output text.
-- @param rules table|nil: list of rule strings.
-- @param flag any: conversion flag; anything other than a non-empty string
--   is replaced by 'H'. The special value 'raw' selects plain output.
-- @return string: '' for a missing/empty list; with 'raw', the rules joined
--   by newlines; otherwise each rule wrapped as `-{flag|rule}-` and
--   concatenated without a separator.
local function formatRules(rules, flag)
    if not rules or #rules == 0 then
        return ''
    end

    if type(flag) ~= 'string' or flag == '' then
        flag = 'H'
    end

    if flag == 'raw' then
        return table.concat(rules, "\n")
    end

    local opener = "-{" .. flag .. "|"
    local pieces = {}
    for index, rule in ipairs(rules) do
        pieces[index] = opener .. rule .. "}-"
    end
    return table.concat(pieces, "")
end

--[[--------------------------< 核心获取函数 >--------------------------]]
--- Read every conversion rule declared by a NoteTA-style template on a page.
-- Three kinds of parameters are probed through TPV.getParameter, which is
-- used here as returning `ok, valueOrMessage`:
--   1. `T`                      -> explicit title rule;
--   2. `1` .. MAX_LOCAL_RULES   -> local (manual) rules;
--   3. `G1` .. `G<MAX_GROUP_RULES>` -> public group names, each expanded into
--      the rules of Module:CGroup/<name>.
-- A failed lookup of a numbered or group parameter ends only that loop. It
-- never aborts the whole fetch, because a page may declare groups without
-- declaring parameter 1 or T.
-- @param pageTitleString string: title of the page to read.
-- @return table|nil: `{ title = string|nil, ['local'] = {rule, ...},
--   groups = { {name = ..., rules = {...}}, ... } }`, or nil when none of the
--   probed parameters had a non-empty value.
function p._internal_fetchAllRules(pageTitleString)
    local allRules = {
        title = nil,
        ['local'] = {},
        groups = {} -- { {name=..., rules={...}}, ... }
    }
    local foundTemplate = false

    -- The title object is only used to log a warning about a bad title.
    if not mw.title.new(pageTitleString) then
        mw.log('ConversionRuleExtractor: Potentially invalid page title provided for fetching rules: ' .. tostring(pageTitleString))
    end

    -- 1. Title rule (T).
    local success_t, titleRule = TPV.getParameter(pageTitleString, NOTE_TA_TEMPLATES, 'T')
    if success_t then
        if titleRule and titleRule ~= '' then
            allRules.title = titleRule
            foundTemplate = true
        end
    else
        mw.log('ConversionRuleExtractor: TPV.getParameter failed for T on page ' .. pageTitleString .. ': ' .. tostring(titleRule))
    end

    -- 2. Local rules (1..MAX_LOCAL_RULES); stop at the first failed lookup.
    for i = 1, MAX_LOCAL_RULES do
        local success_l, localRule = TPV.getParameter(pageTitleString, NOTE_TA_TEMPLATES, tostring(i))
        if not success_l then
            break
        end
        if localRule and localRule ~= '' then
            table.insert(allRules['local'], localRule)
            foundTemplate = true
        end
    end

    -- 3. Group rules (G1..G<MAX_GROUP_RULES>); stop at the first failed lookup.
    for i = 1, MAX_GROUP_RULES do
        local success_g, groupName = TPV.getParameter(pageTitleString, NOTE_TA_TEMPLATES, 'G' .. i)
        if not success_g then
            break
        end
        if groupName and groupName ~= '' then
            foundTemplate = true
            local groupData = loadCGroupData(groupName)
            if groupData then
                local rulesFromGroup = extractRulesFromCGroupData(groupData)
                if #rulesFromGroup > 0 then
                    -- Keep the group name together with all of its rules.
                    table.insert(allRules.groups, { name = groupName, rules = rulesFromGroup })
                end
            end
        end
    end

    if foundTemplate then
        return allRules
    end
    return nil
end

--[[--------------------------< 公开函数 >--------------------------]]

--- Return all content conversion rules of a page, formatted for output.
-- Combines the page's local rules with every rule of every public group it
-- declares (no filtering) and hands the list to formatRules.
-- @param pageTitle string|title object: page to read. Scribunto title
--   objects are Lua tables, so a table with a non-empty string `fullText`
--   is accepted as a title object.
-- @param flag string|nil: output flag forwarded to formatRules.
-- @return string: formatted rules, '' when the page yields no rules, or an
--   error span when `pageTitle` has an unsupported type.
function p.getFullTextRules(pageTitle, flag)
    local pageTitleString
    local inputType = type(pageTitle)
    if inputType == 'string' then
        pageTitleString = pageTitle
    elseif inputType == 'table' and type(pageTitle.fullText) == 'string' and pageTitle.fullText ~= '' then
        pageTitleString = pageTitle.fullText
    elseif inputType == 'userdata' and getmetatable(pageTitle) == 'mw.title' then
        -- Kept for backward compatibility with the previous type check.
        pageTitleString = pageTitle.fullText
    else
        return '<span class="error">错误：无效的页面标题类型。</span>'
    end

    local allRulesData = p._internal_fetchAllRules(pageTitleString)
    if not allRulesData then
        return '' -- nothing found
    end

    local combinedRules = {}
    local log_details = {} -- per-source counts, only used for the log line

    -- Local rules first.
    local localRules = allRulesData['local']
    if localRules and #localRules > 0 then
        for _, rule in ipairs(localRules) do
            combinedRules[#combinedRules + 1] = rule
        end
        log_details[#log_details + 1] = "Local rules: " .. #localRules
    end

    -- Then every rule of every declared public group.
    for _, groupInfo in ipairs(allRulesData.groups or {}) do
        if groupInfo.rules and #groupInfo.rules > 0 then
            for _, rule in ipairs(groupInfo.rules) do
                combinedRules[#combinedRules + 1] = rule
            end
            log_details[#log_details + 1] = "Group '" .. groupInfo.name .. "': " .. #groupInfo.rules .. " rules"
        end
    end

    mw.log('ConversionRuleExtractor: getFullTextRules for page "' .. pageTitleString .. '" combined ' .. #combinedRules .. ' rules. Details: ' .. table.concat(log_details, '; '))

    return formatRules(combinedRules, flag)
end

--- Tell whether a value can be used as a title object.
-- Scribunto title objects are Lua tables exposing a string `fullText`; the
-- userdata test is retained only for backward compatibility.
local function isTitleObject(value)
    local valueType = type(value)
    if valueType == 'table' then
        return type(value.fullText) == 'string'
    end
    return valueType == 'userdata' and getmetatable(value) == 'mw.title'
end

--- Build the rule list that applies to a title text.
-- Order: optional explicit title rule, all local rules, then the group rules
-- that filterGroupRulesByText matches against `titleText`.
local function gatherRulesForTitle(allRulesData, titleText, includeTitleRule)
    local gathered = {}
    if includeTitleRule and allRulesData.title and allRulesData.title ~= '' then
        gathered[#gathered + 1] = allRulesData.title
    end
    for _, rule in ipairs(allRulesData['local'] or {}) do
        gathered[#gathered + 1] = rule
    end

    local groupRules = {}
    for _, groupInfo in ipairs(allRulesData.groups or {}) do
        for _, rule in ipairs(groupInfo.rules or {}) do
            groupRules[#groupRules + 1] = rule
        end
    end
    for _, rule in ipairs(filterGroupRulesByText(groupRules, titleText)) do
        gathered[#gathered + 1] = rule
    end
    return gathered
end

--- Return the conversion rules that apply to a page's title.
-- @param pageTitle string|title object: page whose rules are read.
-- @param flag string|nil: output flag for formatRules (non-context mode).
-- @param outputType string|nil: 'context' selects the special mode below.
-- @param frame frame|nil: required when `outputType == 'context'`, because
--   the result is produced with frame:preprocess.
-- @return string:
--   * non-context: formatRules over title rule + local rules + group rules
--     matching the title text ('' when the page yields no rules);
--   * context with an explicit T rule: preprocessed `-{T|rule}-`;
--   * context without T rule: preprocessed `-{H|rule}-...` followed by the
--     title text, or the bare title text when no rule applies or the page
--     yields no rules;
--   * an error span for invalid input or failed preprocessing.
function p.getTitleRules(pageTitle, flag, outputType, frame)
    local pageTitleString
    local titleObj

    if type(pageTitle) == 'string' then
        pageTitleString = mw_text.trim(pageTitle)
        if pageTitleString == '' then
            return '<span class="error">错误：页面标题不能为空字符串。</span>'
        end

        local success_create, result_or_err = pcall(mw.title.new, pageTitleString)
        if not success_create then
            mw.log('ConversionRuleExtractor: pcall to mw.title.new failed for string: "' .. pageTitleString .. '". Error: ' .. tostring(result_or_err))
            return '<span class="error">错误：创建标题对象时调用失败 (' .. tostring(result_or_err) .. ')。</span>'
        end
        -- mw.title.new returns nil for an invalid title and a table otherwise.
        if not isTitleObject(result_or_err) then
            local returned_value_str = type(result_or_err) == 'table' and mw.dumpObject(result_or_err) or tostring(result_or_err)
            mw.log('ConversionRuleExtractor: mw.title.new succeeded but returned unexpected type (' .. type(result_or_err) .. ') or nil for string: "' .. pageTitleString .. '". Value: ' .. returned_value_str)
            return '<span class="error">错误：创建标题对象成功，但返回类型无效 (' .. type(result_or_err) .. ')。</span>'
        end
        titleObj = result_or_err
    elseif isTitleObject(pageTitle) then
        titleObj = pageTitle
        pageTitleString = titleObj.fullText
        if not pageTitleString or pageTitleString == '' then
            mw.log('ConversionRuleExtractor: Input title object has empty fullText.')
            return '<span class="error">错误：输入的标题对象无效。</span>'
        end
    else
        return '<span class="error">错误：无效的页面标题输入类型。</span>'
    end

    -- Title objects expose the namespace-less title as the `text` property;
    -- they have no getText method.
    local titleText = titleObj.text
    if type(titleText) ~= 'string' then
        mw.log('ConversionRuleExtractor: Title object for page ' .. pageTitleString .. ' has no usable text property.')
        return '<span class="error">错误：无法获取标题文本内容。</span>'
    end

    local allRulesData = p._internal_fetchAllRules(pageTitleString)
    if not allRulesData then
        return (outputType == 'context') and titleText or ''
    end

    if outputType ~= 'context' then
        -- Plain formatted output of every rule relevant to the title.
        return formatRules(gatherRulesForTitle(allRulesData, titleText, true), flag)
    end

    -- Context mode from here on.
    if not frame then
        return '<span class="error">错误：type=\'context\' 需要 frame 对象。</span>'
    end

    if allRulesData.title and allRulesData.title ~= '' then
        local rule_wikitext = "-{T|" .. allRulesData.title .. "}-"
        local success_preprocess, result = pcall(frame.preprocess, frame, rule_wikitext)
        if success_preprocess then
            return result
        end
        mw.log('ConversionRuleExtractor: frame:preprocess failed for T rule on page ' .. pageTitleString .. ': ' .. tostring(result))
        return '<span class="error">处理T规则时出错。</span>'
    end

    local applicableRules = gatherRulesForTitle(allRulesData, titleText, false)
    if #applicableRules == 0 then
        return titleText
    end

    local rules_wikitext = formatRules(applicableRules, 'H')
    local success_preprocess, result = pcall(frame.preprocess, frame, rules_wikitext .. titleText)
    if success_preprocess then
        return result
    end
    mw.log('ConversionRuleExtractor: frame:preprocess failed for H rules on page ' .. pageTitleString .. ': ' .. tostring(result))
    return '<span class="error">应用H规则时出错。</span>'
end

--[[--------------------------< 模板入口点 >--------------------------]]
--- Template entry point for full-text rules.
-- Reads the page from the first positional argument or `page`, and the
-- output flag from `flag`, then delegates to p.getFullTextRules.
-- @param frame frame: invocation frame passed to Arguments.getArgs.
-- @return string: result of p.getFullTextRules, or an error span when no
--   page was supplied.
function p.getFullText(frame)
    local args = Arguments.getArgs(frame)
    local sourcePage = args[1] or args.page

    if not sourcePage or sourcePage == '' then
        return '<span class="error">错误：必须提供页面标题。</span>'
    end

    -- Parentheses keep the return to a single value.
    return (p.getFullTextRules(sourcePage, args.flag))
end

--- Template entry point for title rules.
-- Reads the page from the first positional argument or `page`, the output
-- flag from `flag` and the output type from `type`, then delegates to
-- p.getTitleRules, passing the frame along.
-- @param frame frame: invocation frame passed to Arguments.getArgs.
-- @return string: result of p.getTitleRules, or an error span when no page
--   was supplied.
function p.getTitle(frame)
    local args = Arguments.getArgs(frame)
    local sourcePage = args[1] or args.page

    if not sourcePage or sourcePage == '' then
        return '<span class="error">错误：必须提供页面标题。</span>'
    end

    -- Parentheses keep the return to a single value.
    return (p.getTitleRules(sourcePage, args.flag, args.type, frame))
end

return p