Wikipedia:Archiv/Alternative Benutzerstatistik/Programme/mkzsf
Zur Navigation springen
Zur Suche springen
# !/usr/bin/perl
use Digest::MD5 qw(md5_base64);
$bs = "Linux"; # oder "Windows"
sub xmlunesc {
my $text = shift;
$text =~ s/</</sg;
$text =~ s/>/>/sg;
$text =~ s/'/'/sg;
$text =~ s/"/"/sg; #"
$text =~ s/&/&/sg;
$text;
}
sub xmlsiteinfo {
while($xml =~ /<namespace key="(.*?)"(?: \/>|>(.*?)<\/namespace>)/sg) {
$namespace{$2} = $1 if defined $2;
}
$xml = "";
}
sub xmlrevision {
$xml =~ s/<revision>(.*?)<\/revision>//s;
my $revcontent = $1;
$rev = {};
xmlpage() unless defined $page;
$xml = "";
while($revcontent =~ /<(id|timestamp|contributor|minor|comment|text)(?: xml:space="preserve"| type="(.*?)")*(?:\s*\/>|>(.*?)<\/\1>)/sg) {
my ($tag, $type, $content) = ($1, $2, $3);
$content = "" unless defined $content;
if($tag =~ /^text/) {
$rev->{"text"} = xmlunesc($content);
} elsif($tag eq "contributor") {
if($content =~ /<username>(.*?)<\/username>\s*<id>(.*?)<\/id>/) {
$rev->{"user_text"} = xmlunesc($1);
$rev->{"user"} = $2;
} elsif($content =~ /<ip>(.*?)<\/ip>/) {
$rev->{"user_text"} = xmlunesc($1);
$rev->{"user"} = 0;
} else {
$rev->{"user_text"} = "_";
$rev->{"user"} = 0;
}
} elsif($tag eq "comment") {
$rev->{"comment"} = xmlunesc($content);
} elsif($tag eq "timestamp") {
$content =~ /^(....)-(..)-(..)T(..):(..):(..)Z$/;
$rev->{"timestamp"} = "$1$2$3$4$5$6";
} else {
$rev->{$tag} = $content;
}
}
revision();
}
sub xmlpage {
while($xml =~ s/<(title|id|restrictions)(?:\s*\/>|>(.*?)<\/\1>)//s) {
my ($tag, $content) = ($1, $2);
if($tag eq "title") {
$content = xmlunesc($content);
if($content =~ /(.+?):(.+)/ && defined $namespace{$1}) {
$page->{"namespace"} = $namespace{$1};
$content = $2;
} else {
$page->{"namespace"} = 0;
}
}
$page->{$tag} = defined $content ? $content : "";
}
}
sub revision {
my $is_redirect = $rev->{"text"} =~ /^# ?redirect/i;
my $len = length($rev->{"text"});
return unless defined $page->{"title"};
my $text_md5 = md5_base64($rev->{"text"});
my $loeschlink = $rev->{"text"} =~ /\[\[Wikipedia:(Löschkandidaten|Seiten, die gelöscht werden sollten)/s ||
$rev->{"text"} =~ /\{\{(msg:)?(vfd|Lösch|URV)/is;
$page->{"title"} = "_" unless defined $page->{"title"};
$page->{"title"} =~ s/\s/_/sg;
$rev->{"user_text"} =~ s/\s/_/sg;
$rev->{"comment"} = "_" unless defined $rev->{"comment"};
$rev->{"comment"} =~ s/\s/_/sg;
printf ZSF1 "%s %7i %i %i %s %20s %i %s\n", $rev->{"timestamp"}, $len, $is_redirect,
$loeschlink, $text_md5, $rev->{"user_text"}, $page->{"namespace"}, $page->{"title"};
}
sub kategorien {
my $links = $page->{"title"};
while($rev->{"text"} =~ /\[\[Kategorie:([^\|\]]*)/sg) {
my $kat = $1;
$kat =~ s/\s/_/sg;
$links .= " " . $kat;
}
print KATLINKS "$links\n" if $page->{"namespace"}==0;
print KATTREE "$links\n" if $page->{"namespace"}==14;
}
sub sortiere {
my $tmp = $ENV{"LC_ALL"};
$ENV{"LC_ALL"} = "C";
system "sort $_[0] /O $_[1]" if $bs eq "Windows";
system "sort -T ./sort-tmp -S 400M -o $_[1] $_[0]" if $bs eq "Linux";
$ENV{"LC_ALL"} = $tmp;
unlink $_[0];
}
mkdir "sort-tmp" unless -d "sort-tmp";
open ZSF1, ">zsf1";
open KATLINKS, ">katlinks";
open KATTREE, ">kattree";
while(<>) {
$xml .= $_;
if(/^\s*<\/siteinfo>/) {
xmlsiteinfo();
}
next unless defined %namespace;
if(/^\s*<\/revision>/) {
xmlrevision();
} elsif(/^\s*<\/page>/) {
kategorien();
$xml = "";
$page = undef;
}
}
close ZSF1; close KATLINKS; close KATTREE;
sortiere("zsf1", "zsf");
rmdir "sort-tmp";