Jump to content

Wikipedia:Duplicated sections/script

From Wikipedia, the free encyclopedia
The printable version is no longer supported and may have rendering errors. Please update your browser bookmarks and please use the default browser print function instead.
# Hot pipes: turn on autoflush for the currently selected output handle
# (STDOUT unless select() has changed it), so anything printed there is
# written out immediately instead of sitting in a buffer.
$| = 1;

# This script expects data/entries.txt to be a database dump that has
# been pre-processed to put each page on a line by itself.  Each line
# is evaluated as a Perl list of (id, namespace, title, text, ...).

# On 31 July 2005, this script ran on a 1.2GHz i686 laptop with ~700MB
# RAM in about 20 minutes.  Not using the dupHeaders() filter will
# cause it to take probably about 5 hours or more.

# The author of this script is Christopher Beland, User:Beland on
# en.wikipedia.org.  It is hereby released into the Public Domain.
# Feel free to use it for any purpose whatsoever.

use strict;

# Entry point: run the scan implemented by main() below.
main();

# Scan data/entries.txt (one page per line) and report, for every page
# that has at least one duplicated header line, how many of its word
# triplets occur more than once.
#
# Each input line is evaluated as a Perl list whose first four elements
# are (id, namespace number, title, text).  Pages rejected by
# dupHeaders() are skipped.  For each remaining page that contains at
# least one repeated triplet, one line is written to
# todo/duplicate-chunks.txt in the form
#
#   * [[Prefix:Title]] P% repeated - R out of T triplets
#
# where T is the number of three-word windows in the page text, R is the
# number of distinct windows that occur more than once, and P is
# int(R / T * 100).
#
# Takes no arguments and returns nothing useful.  Dies if the input
# cannot be read or the output cannot be created or written.
sub main
{
    # Link prefix for each known namespace number.  The leading colon on
    # Image and Category makes the wiki render a link to the page rather
    # than embedding the image or categorizing the report page.
    my %namespace_prefix = (
        -2 => "Media:",
        -1 => "Special:",
         0 => "",
         1 => "Talk:",
         2 => "User:",
         3 => "User_talk:",
         4 => "Wikipedia:",
         5 => "Wikipedia_talk:",
         6 => ":Image:",
         7 => "Image_talk:",
         8 => "MediaWiki:",
         9 => "MediaWiki_talk:",
        10 => "Template:",
        11 => "Template_talk:",
        12 => "Help:",
        13 => "Help_talk:",
        14 => ":Category:",
        15 => "Category_talk:",
    );

    unless (-d "./todo")
    {
        mkdir("./todo") || die "Cannot create ./todo: $!";
    }

    open (my $entries, "<", "data/entries.txt")
        || die "Cannot read data/entries.txt: $!";
    open (my $duphead, ">", "todo/duplicate-chunks.txt")
        || die "Cannot write todo/duplicate-chunks.txt: $!";

    my $lines_read = 0;

    while (my $line = <$entries>)
    {
        # Progress indicator, rewritten in place every 100 lines.
        if (++$lines_read % 100 == 0)
        {
            print STDERR $lines_read."\r";
        }

        # SECURITY: this executes every input line as Perl code, so
        # data/entries.txt must come from a trusted source.  It is kept
        # because the dump format is a Perl list literal.
        #
        # @tokens is fresh for every line and $@ is checked, so a line
        # that fails to parse is skipped instead of silently re-using
        # the previous page's fields.
        my @tokens;
        eval("\@tokens = $line");
        if ($@)
        {
            warn "Skipping unparseable line $lines_read: $@";
            next;
        }

        my (undef, $cur_namespace, $cur_title, $cur_text) = @tokens;

        # Cheap pre-filter: only pages with a duplicated header get the
        # expensive triplet analysis.
        next unless (dupHeaders($cur_text) == 1);

        # Normalize to a number so that the lookup matches the same
        # values a numeric comparison would.  Unknown namespaces get an
        # empty prefix rather than whatever the previous page used.
        my $ns_key = (defined($cur_namespace) ? $cur_namespace : 0) + 0;
        my $cur_namespace_name = exists($namespace_prefix{$ns_key})
            ? $namespace_prefix{$ns_key}
            : "";

        # Remove leading and trailing 's.
        $cur_title =~ s/^\'//;
        $cur_title =~ s/\'$//;
        # Remove leading and trailing whitespace
        $cur_title =~ s/^\s*//;
        $cur_title =~ s/\s*$//;

        # Escaped newlines become word separators; split(" ") then
        # splits on any run of whitespace and ignores leading whitespace.
        $cur_text =~ s/\\n/ /g;
        my @chunks = split (" ", $cur_text);

        # Count every window of three consecutive words.  The condition
        # is >= 3 so that the window made of the first three words of the
        # page is counted too.
        my %chains;
        my $triplets = 0;

        while (@chunks >= 3)
        {
            $chains{join(" ", @chunks[-1, -2, -3])}++;

            # Note: pop from the rear is a bjillion times more
            # efficient than unloading manually from the front.
            pop(@chunks);

            $triplets++;
        }

        # Number of distinct triplets that occur more than once.
        my $numberRepeated = grep { $_ > 1 } values(%chains);

        next if ($numberRepeated == 0);

        # $triplets is at least 1 here because %chains is not empty.
        my $per = int(($numberRepeated / $triplets) * 100);

        print $duphead "* [[".$cur_namespace_name.$cur_title."]]"
            ." ${per}% repeated - $numberRepeated out of $triplets triplets\n";
    }

    close ($entries);

    # Buffered write errors only surface at close time.
    close ($duphead)
        || die "Cannot finish writing todo/duplicate-chunks.txt: $!";
}


# Tell whether a page contains the same header line more than once.
#
# Argument: the page text.  Line breaks may be real newlines or the
# two-character escape sequence backslash-n; both are honoured.
#
# A "header" is any line whose first non-whitespace character is "=".
# Lines are compared verbatim, including surrounding whitespace.
#
# Returns 1 if at least one header line occurs twice or more, else 0.
sub dupHeaders
{
    my ($page) = @_;

    # A page without any "=" cannot have headers, let alone repeated ones.
    return 0 unless ($page =~ m/=/);

    # Expand escaped newlines so the text can be cut into lines.
    $page =~ s/\\n/\n/g;

    my %seen;

    foreach my $candidate (split ("\n", $page))
    {
        next unless ($candidate =~ m/^\s*\=/);

        # Second sighting of the same header line: that is a duplicate.
        return 1 if (++$seen{$candidate} > 1);
    }

    # Every header line turned out to be unique.
    return 0;
}


# Write a copy of the report ordered by its third whitespace-separated
# field (the "P%" column), numerically, highest first.  The command's
# stdout goes to the sorted file, so there is nothing to print here;
# its exit status is checked so that a failed sort is not silent.
system("sort -nr -k3 todo/duplicate-chunks.txt > todo/duplicate-chunks-sorted.txt") == 0
    or warn "Sorting todo/duplicate-chunks.txt failed (wait status $?)\n";