跳转到内容

User:What7what8/test.js

维基百科,自由的百科全书

这是本页的一个历史版本,由What7what8(留言 | 贡献)于2025年4月18日 (五) 10:24编辑。这可能和当前版本存在着巨大的差异。

注意:保存之后,你必须清除浏览器缓存才能看到做出的更改。Google Chrome、Firefox、Microsoft Edge、Safari:按住⇧ Shift键并单击工具栏的“刷新”按钮。参阅Help:绕过浏览器缓存以获取更多帮助。
$.when(
    mw.loader.getScript( "https://cdn.jsdelivr.net/npm/opencc-js@1.0.5/dist/umd/full.js"),
    mw.loader.getScript( "https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.min.js"),
    mw.loader.getScript( "https://unpkg.com/mammoth@1.4.8/mammoth.browser.min.js"),
    mw.loader.getScript( "https://cdnjs.cloudflare.com/ajax/libs/jschardet/1.4.1/jschardet.min.js"),
    mw.loader.getScript( "https://cdnjs.cloudflare.com/ajax/libs/jsdiff/7.0.0/diff.min.js"),
).then(
    () => {
        // Initial configuration shared by the helper functions below.

        // pdf.js needs the URL of its worker script; the version matches the
        // pdf.min.js build requested in the loader list above.
        pdfjsLib.GlobalWorkerOptions.workerSrc = 'https://cdnjs.cloudflare.com/ajax/libs/pdf.js/3.4.120/pdf.worker.min.js';

        // The opencc-js browser bundle exposes a converter *factory*
        // (OpenCC.Converter) that returns a plain function; it has no
        // `new OpenCC(...)` constructor. Wrap the function so the rest of the
        // script can keep calling `cc.convert(text)`.
        // from 't' (OpenCC standard Traditional) to 'cn' (Simplified) is the
        // equivalent of the classic "t2s" profile.
        const toSimplified = OpenCC.Converter({ from: 't', to: 'cn' });
        const cc = { convert: (text) => toSimplified(text) };

        // URLs that have already been requested during this page load.
        const visitedUrl = [];

        // NOTE(review): browsers may ignore a script-supplied User-Agent or treat
        // it as a non-safelisted header (triggering a CORS preflight) — confirm
        // that sending it actually helps before relying on it.
        const headers = { 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0' };

        /**
         * Collects the article text and the external links of the page that is
         * currently displayed.
         *
         * Links are gathered from every anchor inside `#bodyContent` (including
         * those in citations). The text is taken from a detached copy of the
         * content with `cite` elements and `.references` lists removed, so the
         * page the reader is looking at is left untouched.
         *
         * @returns {Promise<{rawText: string, links: string[]}>} The extracted
         *   text and the de-duplicated external URLs. On any failure the error
         *   is logged and `{ rawText: '', links: [] }` is returned.
         */
        async function fetchWikiContent() {
            try {
                const bodyContent = $('#bodyContent')[0];

                // Keep http(s) links whose *host* is not wikipedia.org or one of
                // its subdomains. Checking the hostname (rather than searching
                // the whole URL) keeps external pages whose path or query merely
                // mentions "wikipedia.org".
                const links = Array.from(bodyContent.querySelectorAll('a[href]'))
                    .filter(a =>
                        (a.protocol === 'http:' || a.protocol === 'https:') &&
                        !/(^|\.)wikipedia\.org$/i.test(a.hostname)
                    )
                    .map(a => a.href);
                const uniqueLinks = [...new Set(links)];

                // Strip citation markup from a clone, never from the live DOM.
                const workingCopy = bodyContent.cloneNode(true);
                workingCopy
                    .querySelectorAll('cite, .references')
                    .forEach(node => node.remove());

                const rawText = workingCopy.textContent;
                return { rawText, links: uniqueLinks };
            } catch (error) {
                console.error('获取维基内容失败:', error);
                return { rawText: '', links: [] };
            }
        }

        /**
         * Normalises a piece of text for comparison: runs it through the shared
         * `cc` converter and trims leading/trailing whitespace.
         *
         * @param {string} text - Text to normalise.
         * @returns {string} The converted, trimmed text.
         */
        function processContent(text) {
            const converted = cc.convert(text);
            return converted.trim();
        }

/**
 * Finds runs of text shared by two documents using a character-level diff.
 *
 * Whitespace is removed from BOTH inputs before diffing. Comparing a
 * whitespace-free target against a source that still contains spaces and
 * line breaks would split every shared passage at each whitespace
 * character and hide genuine overlaps.
 *
 * @param {string} source - Text of the page being checked.
 * @param {string} target - Text of the candidate origin document.
 * @param {number} [threshold=12] - Only unchanged runs strictly longer than
 *   this many characters are reported.
 * @returns {string[]} The shared runs (whitespace-free), in diff order.
 */
function getMatches(source, target, threshold = 12) {
    const stripWhitespace = (text) => text.replace(/\s/g, '');
    const diffs = Diff.diffChars(stripWhitespace(source), stripWhitespace(target));

    const matches = [];
    let currentMatch = '';

    // Records the run collected so far (if long enough) and starts a new one.
    const flush = () => {
        if (currentMatch.length > threshold) {
            matches.push(currentMatch);
        }
        currentMatch = '';
    };

    for (const part of diffs) {
        if (part.added || part.removed) {
            // Any insertion or deletion ends the current shared run.
            flush();
        } else {
            currentMatch += part.value;
        }
    }

    // The diff may end on a shared run that was never closed by a change.
    flush();

    return matches;
}

        /**
         * Downloads one external document and returns its normalised text.
         *
         * Supported payloads, chosen by the response Content-Type:
         *   - PDF: text items of every page, extracted with pdf.js;
         *   - Word (OOXML): raw text extracted with mammoth;
         *   - anything else: decoded as text using the encoding guessed by
         *     jschardet (UTF-8 as fallback); for HTML only the body text
         *     without script/style content is kept.
         *
         * Each URL is requested at most once; repeated calls return `[]`.
         *
         * @param {string} url - Address of the document to fetch.
         * @returns {Promise<string[]>} A one-element array with the processed
         *   text, or an empty array if the URL was already visited or anything
         *   failed (failures are logged, never thrown).
         */
        async function processSource(url) {
            if (visitedUrl.includes(url)) return [];
            visitedUrl.push(url);

            try {
                const response = await fetch(url, { headers });
                if (!response.ok) throw new Error(`HTTP错误 ${response.status}`);

                // The header may be absent (null); treat that as "unknown type".
                const contentType = response.headers.get('Content-Type') || '';
                const buffer = await response.arrayBuffer();
                let text = '';

                if (/application\/pdf/i.test(contentType)) {
                    // PDF: concatenate the text items of every page.
                    const pdf = await pdfjsLib.getDocument({ data: buffer }).promise;
                    for (let pageNo = 1; pageNo <= pdf.numPages; pageNo++) {
                        const page = await pdf.getPage(pageNo);
                        const content = await page.getTextContent();
                        text += content.items.map(item => item.str).join(' ');
                    }
                } else if (/application\/(msword|vnd\.openxmlformats-officedocument)/i.test(contentType)) {
                    // Word document.
                    const result = await mammoth.extractRawText({ arrayBuffer: buffer });
                    text = result.value;
                } else {
                    // Any other type is treated as text in an unknown encoding.
                    const bytes = new Uint8Array(buffer);

                    // jschardet inspects its input with string operations, so it
                    // is given a "binary string" (one character per byte) instead
                    // of the typed array. Built in chunks to stay below the
                    // engine's argument-count limit.
                    const CHUNK_SIZE = 0x8000;
                    let binary = '';
                    for (let offset = 0; offset < bytes.length; offset += CHUNK_SIZE) {
                        binary += String.fromCharCode.apply(null, bytes.subarray(offset, offset + CHUNK_SIZE));
                    }
                    const detection = jschardet.detect(binary);
                    const encoding = detection.confidence > 0.6 ? detection.encoding : 'utf-8';

                    try {
                        text = new TextDecoder(encoding, { fatal: true }).decode(bytes);
                    } catch {
                        // Unknown encoding label or invalid byte sequence:
                        // fall back to lenient UTF-8 decoding.
                        text = new TextDecoder('utf-8', { fatal: false }).decode(bytes);
                    }

                    if (/text\/html/i.test(contentType)) {
                        const doc = new DOMParser().parseFromString(text, 'text/html');
                        // Script and style bodies are code, not page prose.
                        doc.querySelectorAll('script, style').forEach(node => node.remove());
                        text = doc.body.textContent;
                    }
                }

                return [processContent(text)];
            } catch (error) {
                console.error(`处理链接失败 [${url}]:`, error);
                return [];
            }
        }
		
        /**
         * Runs the whole overlap check for the current page: extracts the
         * article text and its external links, compares the article against
         * every linked document, then logs a report and offers it as a
         * text-file download.
         *
         * @returns {Promise<void>} Resolves once the download has been triggered.
         */
        async function copyvio() {
            // Longest excerpt of a match that is written to the report.
            const MAX_PREVIEW = 100;

            // Article text and candidate sources.
            const { rawText, links } = await fetchWikiContent();
            const processedWiki = processContent(rawText);

            // URL -> unique matching passages, in the order the links were found.
            // Sources are fetched one after another on purpose.
            const results = new Map();
            for (const link of links) {
                const contents = await processSource(link);
                for (const content of contents) {
                    const matches = getMatches(processedWiki, content);
                    if (matches.length === 0) continue;
                    const merged = new Set([...(results.get(link) || []), ...matches]);
                    results.set(link, [...merged]);
                }
            }

            // Build the report.
            let output = '查重结果:\n\n';
            for (const [url, matches] of results) {
                output += `URL: ${url}\n匹配内容(${matches.length}条):\n`;
                matches.forEach((match, i) => {
                    // Only mark an entry with "..." when it was actually cut.
                    const preview = match.length > MAX_PREVIEW
                        ? `${match.substring(0, MAX_PREVIEW)}...`
                        : match;
                    output += `${i + 1}. ${preview}\n`;
                });
                output += '\n';
            }

            // Show the report and download it through a temporary link.
            console.log(output);
            const blob = new Blob([output], { type: 'text/plain;charset=utf-8' });
            const objectUrl = URL.createObjectURL(blob);
            const a = document.createElement('a');
            a.href = objectUrl;
            a.download = '查重结果.txt';
            document.body.appendChild(a);
            try {
                a.click();
            } finally {
                document.body.removeChild(a);
                // Release the blob once the click has been dispatched; deferred
                // so the browser can start the download first.
                setTimeout(() => URL.revokeObjectURL(objectUrl), 0);
            }
        }

        // 启动程序
        //copyvio().catch(console.error);
    },
    ( loadError ) => {
        // Rejection handler of the loader promise: a script could not be
        // loaded, so only its message is reported through MediaWiki's logger.
        const { message } = loadError;
        mw.log.error( message ); // => "Failed to load script"
    }
);