imported>PexEric：←建立内容为“-- Module:Conversion_rule_extractor/Matcher -- 子模块：负责匹配规则与目标页面内容 local Matcher = {} -- 工具函数：从规则字符串中提取需要匹配的源文本 -- 例如：'zh-cn:图尔库;zh-tw:土庫;' -> {"图尔库", "土庫"} -- 例如：'巨集=>zh-cn:宏;' -> {"巨集"} local function extractRuleSources(ruleString) local sources = {} local sourceSet = {} -- 用于去重 -- 移除外层包裹（…”的新页面

2025-05-03T17:05:46Z

←建立内容为“-- Module:Conversion_rule_extractor/Matcher -- 子模块：负责匹配规则与目标页面内容 local Matcher = {} -- 工具函数：从规则字符串中提取需要匹配的源文本 -- 例如：'zh-cn:图尔库;zh-tw:土庫;' -> {"图尔库", "土庫"} -- 例如：'巨集=>zh-cn:宏;' -> {"巨集"} local function extractRuleSources(ruleString) local sources = {} local sourceSet = {} -- 用于去重 -- 移除外层包裹（…”的新页面

新页面

-- Module:Conversion_rule_extractor/Matcher
-- Submodule: matches conversion rules against the content of a target page.

-- Table holding the functions exported by this module (returned at the end of the file).
local Matcher = {}

--- Extract the source texts that a conversion rule should be matched on.
--
-- The rule string is split on ';' and every non-empty segment is classified,
-- in this order of precedence:
--   * 'A=>...'        (one-way rule)                      -> A
--   * 'xx-yy:Text'    (two-way rule, variant like zh-cn)  -> Text
--   * 'Text:Variant'  (non-standard form)                 -> Text before the first ':'
--   * 'Text'          (contains neither '=' nor ':')      -> the whole segment
-- Examples:
--   'zh-cn:图尔库;zh-tw:土庫;'     -> {"图尔库", "土庫"}
--   '巨集=>zh-cn:宏;'              -> {"巨集"}
--   '-{zh-cn:图尔库;zh-tw:土庫}-'  -> {"图尔库", "土庫"}
--
-- @param ruleString string  rule text, optionally wrapped in -{ ... }- (with or without flags)
-- @return table  sequence of unique, trimmed, non-empty source strings in order of
--                first appearance; empty when ruleString is not a string
local function extractRuleSources(ruleString)
    local sources = {}
    if type(ruleString) ~= 'string' then
        return sources
    end
    local seen = {} -- set used for de-duplication

    -- Strip the outer -{ ... }- wrapper if present. The flag section ("flags|")
    -- is optional: '-{H|zh-cn:a;}-' and '-{zh-cn:a;}-' are both unwrapped.
    -- Previously only the flagged form was recognised, so the unflagged form
    -- leaked '-{' and '}-' into the extracted sources.
    local wrapped = ruleString:match('^%s*%-{(.*)}%-%s*$')
    if wrapped then
        ruleString = wrapped:match('^[^|]*|(.*)$') or wrapped
    end

    for segment in mw.text.gsplit(ruleString, ';') do
        segment = mw.text.trim(segment)
        if segment ~= '' then
            -- Try the recognised shapes from most to least specific. An empty
            -- capture ('' is truthy in Lua) stops the chain and is discarded below.
            local source = segment:match('^([^=]-)=>')     -- one-way rule A=>B
                or segment:match('^%w+%-%w+:(.+)')         -- two-way rule lang:Text
                or segment:match('^([^:]+):.+')            -- Text:Variant, Text assumed to be the source
            if not source and not segment:find('[=:]') then
                -- Bare text without '=>' or ':' (an invalid rule that may still
                -- occur): fall back to the whole segment.
                source = segment
            end

            if source then
                source = mw.text.trim(source)
                if source ~= '' and not seen[source] then
                    seen[source] = true
                    sources[#sources + 1] = source
                end
            end
        end
    end

    return sources
end

--- Build the trie used for matching (adapted from Module:NoteTA-lite).
--
-- Each source text produced by extractRuleSources for a rule becomes a path of
-- single-character keys; the node at the end of the path carries an `indices`
-- list with the positions (in rulesList) of every rule sharing that source.
--
-- @param rulesList table  sequence of rule strings
-- @return table  root node of the trie
function Matcher.buildRuleTrie(rulesList)
    local root = {}

    -- Walk the path spelled by `source`, creating missing nodes, and return
    -- the node reached after its last character.
    local function nodeFor(source)
        local node = root
        -- mw.ustring is used so that the path is keyed by UTF-8 characters, not bytes.
        for position = 1, mw.ustring.len(source) do
            local ch = mw.ustring.sub(source, position, position)
            local child = node[ch]
            if not child then
                child = {}
                node[ch] = child
            end
            node = child
        end
        return node
    end

    for ruleIndex, ruleString in ipairs(rulesList) do
        for _, source in ipairs(extractRuleSources(ruleString)) do
            local terminal = nodeFor(source)
            local bucket = terminal.indices
            if not bucket then
                bucket = {}
                terminal.indices = bucket
            end
            -- Rules are visited in order, so each bucket stays sorted by rule index.
            bucket[#bucket + 1] = ruleIndex
        end
    end

    return root
end

--- Find which rules have a source text occurring in `text` (adapted from Module:NoteTA-lite).
--
-- Every character position of the text is tried as a start; from there the
-- trie is followed as far as the text allows, and every node carrying an
-- `indices` list on the way records those rule indices as matched (so both a
-- short source and a longer source sharing its prefix are reported).
--
-- @param text string  text to search
-- @param trie table   trie built by Matcher.buildRuleTrie
-- @return table  set of matched rule indices: keys are indices, values are true;
--                empty when text is nil/empty or trie is nil
function Matcher.matchTextWithTrie(text, trie)
    local matchedIndices = {}
    if not text or text == '' or not trie then
        return matchedIndices
    end

    -- Split the text into UTF-8 characters ONCE. Calling mw.ustring.sub(text, j, j)
    -- for every probe has to locate character j in the whole text each time,
    -- which makes scanning a full page far slower than necessary.
    -- The pattern is the usual "lead byte + continuation bytes" UTF-8 matcher;
    -- bytes that cannot start a character are skipped.
    -- NOTE(review): assumes `text` is valid UTF-8 (as page content normally is).
    local chars, count = {}, 0
    for ch in string.gmatch(text, '[%z\1-\127\194-\244][\128-\191]*') do
        count = count + 1
        chars[count] = ch
    end

    for start = 1, count do
        local node = trie
        for position = start, count do
            node = node[chars[position]]
            if not node then
                break -- no source continues with this character
            end
            -- A node with `indices` marks the end of at least one source text.
            local indices = node.indices
            if indices then
                for _, index in ipairs(indices) do
                    matchedIndices[index] = true
                end
            end
            -- Keep going: a longer source may extend this match.
        end
    end

    return matchedIndices
end

--- Main entry point: keep only the rules that match the target's content.
--
-- The target is resolved to text as follows:
--   * a string naming an existing page  -> that page's content
--   * any other string                  -> the string itself
--   * a table with a getContent member (e.g. a title object) -> its getContent() result
--   * anything else                     -> no content; an empty list is returned
--
-- @param rulesList table  sequence of rule strings
-- @param targetPageTitleOrText string|table  page title, raw text, or title object
-- @return table  sequence of the rule strings (original order) whose source text
--                occurs in the resolved content
function Matcher.filterRules(rulesList, targetPageTitleOrText)
    local filteredRules = {}
    if not rulesList or #rulesList == 0 then
        return filteredRules
    end

    local textContent
    local targetType = type(targetPageTitleOrText)
    if targetType == 'string' then
        -- Decide whether the string is a page title or literal text.
        local titleObj = mw.title.new(targetPageTitleOrText)
        if titleObj and titleObj.exists then
            textContent = titleObj:getContent()
        else
            textContent = targetPageTitleOrText
        end
    elseif targetType == 'table' and targetPageTitleOrText.getContent then
        -- Title object (or anything exposing getContent). The explicit table
        -- check keeps numbers/booleans from raising on the field access and
        -- sends them to the "invalid target" path instead.
        textContent = targetPageTitleOrText:getContent()
    else
        return filteredRules -- invalid target: nothing to match against
    end

    if not textContent or textContent == '' then
        return filteredRules -- no content to match
    end

    local trie = Matcher.buildRuleTrie(rulesList)
    local matchedIndices = Matcher.matchTextWithTrie(textContent, trie)

    for index, rule in ipairs(rulesList) do
        if matchedIndices[index] then
            filteredRules[#filteredRules + 1] = rule
        end
    end

    return filteredRules
end

--- Title-only variant: keep the rules whose source text occurs in the page title.
--
-- Only the `text` field of the title object created from pageTitle is searched
-- (the title without its namespace prefix); page content is never read.
--
-- @param rulesList table  sequence of rule strings
-- @param pageTitle string  page title passed to mw.title.new
-- @return table  sequence of matching rule strings in their original order;
--                empty when there are no rules, the title is invalid, or its text is empty
function Matcher.filterRulesAgainstTitleText(rulesList, pageTitle)
    local kept = {}

    local haveRules = rulesList and #rulesList ~= 0
    if not haveRules then
        return kept
    end

    -- nil when the title cannot be created, otherwise the namespace-less title text.
    local titleObj = mw.title.new(pageTitle)
    local titleText = titleObj and titleObj.text
    if not titleText or titleText == '' then
        return kept
    end

    local ruleTrie = Matcher.buildRuleTrie(rulesList)
    local hits = Matcher.matchTextWithTrie(titleText, ruleTrie)

    for position, rule in ipairs(rulesList) do
        if hits[position] then
            kept[#kept + 1] = rule
        end
    end

    return kept
end

-- Export the module table.
return Matcher

Module:Conversion rule extractor/Matcher - 版本历史