#!/usr/bin/perl
#
# ng-analyse: analyse a newsgroup
#
# Donated into the public domain by Chris Lightfoot,
# May 1999
#
# Run with parameters server name, newsgroup name
#
# The script asks the NNTP server for every article posted to the group
# during the last week, fetches the headers of each one, and writes an HTML
# report to standard output containing three league tables: articles per
# author, original (reference-less) articles per author, and articles per
# thread.
#
# NOTE(review): the copy of this file that was reviewed had lost the markup
# inside its here-documents (only the visible text survived).  The visible
# text below is unchanged; the surrounding HTML tags are a reconstruction and
# the bar is drawn with a sized <div> rather than the original image, whose
# source could not be recovered.
#

use strict;
use warnings;

# random config options
my $WIDTH     = 500;                # width in pixels of the longest bar
my $MAX_LINES = 20;                 # maximum number of rows per table
my $PERIOD    = 7 * 24 * 3600;      # how far back to look, in seconds
my $MALFORMED = '(malformed articles)';    # bucket for unusable headers

# munge_email(EMAIL)
#
# Returns EMAIL disguised as "local @ first-few-characters-of-domain..." so
# that the report cannot be harvested trivially.  A string without an "@"
# (for instance the malformed-articles bucket name) is returned unchanged.
sub munge_email {
    my ($email) = @_;
    return $email unless defined $email;
    $email =~ s/^([^\@]+)\@([^.]{1,5}).*$/$1 \@ $2.../;
    return $email;
}

# escape_html(TEXT)
#
# Returns TEXT with the characters & < > " replaced by HTML entities; undef
# becomes the empty string.  Everything taken from article headers goes
# through this before being printed, since headers are untrusted input.
sub escape_html {
    my ($text) = @_;
    return '' unless defined $text;
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    $text =~ s/([&<>"])/$entity_for{$1}/g;
    return $text;
}

# parse_headers(LINES)
#
# Turns a list of raw header lines into a hash reference mapping the
# lower-cased header name to its value (everything after the first colon,
# leading space included).  Folded continuation lines are appended to the
# header they continue; lines that are not headers at all are ignored.  If a
# header is repeated the last occurrence wins.
sub parse_headers {
    my @lines = @_;
    my %value_of;
    my $current;    # name of the header a continuation line would extend

    for my $line (@lines) {
        next unless defined $line;
        $line =~ s/[\r\n]+\z//;

        if ($line =~ /^[ \t]+(\S.*)$/) {
            $value_of{$current} .= " $1" if defined $current;
            next;
        }

        if ($line =~ /^([^:\s]+):(.+)$/) {
            my ($name, $value) = (lc $1, $2);
            $current = $name;
            $value_of{$name} = $value;
        }
    }

    return \%value_of;
}

# parse_from(FROM)
#
# Splits the value of a From: header into (EMAIL, NAME).  Understands
# "Name <address>", "address (Name)" and a bare address.  NAME is the empty
# string when none is given.  Returns the empty list when FROM is missing,
# blank, or contains no address.
sub parse_from {
    my ($from) = @_;
    return unless defined $from && $from =~ /\S/;

    my ($email, $name);
    if ($from =~ /<([^>]+)>/) {
        $email = $1;
        ($name = $from) =~ s/<[^>]+>//;
    }
    elsif ($from =~ /\(([^)]+)\)/) {
        $name = $1;
        ($email = $from) =~ s/\([^)]+\)//g;
    }
    else {
        $email = $from;
    }

    $name = '' unless defined $name;
    $email =~ s/["\s]//g;
    $name  =~ s/"//g;
    $name  =~ s/^\s+//;
    $name  =~ s/\s+$//;

    return unless length $email;
    return ($email, $name);
}

# canonical_subject(SUBJECT)
#
# Returns SUBJECT with surrounding white space and any leading run of "Re:"
# markers removed, or the empty list if SUBJECT is undefined.  The
# lower-cased result is what identifies a thread.
sub canonical_subject {
    my ($subject) = @_;
    return unless defined $subject;
    $subject =~ s/^\s*(?:re:\s*)*//i;
    $subject =~ s/\s+$//;
    return $subject;
}

# new_stats()
#
# Returns a reference to an empty statistics structure:
#   author_names        email address        -> author name
#   subject_names       lower-cased subject  -> subject as first seen
#   articles_by_author  email address        -> number of articles
#   original_by_author  email address        -> number of original articles
#   articles_by_thread  lower-cased subject  -> number of articles
sub new_stats {
    return {
        author_names       => {},
        subject_names      => {},
        articles_by_author => {},
        original_by_author => {},
        articles_by_thread => {},
    };
}

# tally_article(HEADERS, STATS)
#
# Adds one article, described by the hash reference HEADERS (as produced by
# parse_headers), to the statistics structure STATS.
sub tally_article {
    my ($headers, $stats) = @_;

    my ($email, $name) = parse_from($headers->{from});
    if (!defined $email) {
        # article was bogus (no usable From:); count it separately
        $stats->{articles_by_author}{$MALFORMED}++;
    }
    else {
        $stats->{articles_by_author}{$email}++;

        # remember the first non-empty name seen for this address
        my $known = $stats->{author_names}{$email};
        $stats->{author_names}{$email} = $name
            unless defined $known && length $known;

        # an article that references no other article starts a thread
        my $references = $headers->{references};
        if (!defined $references || $references !~ /<[^>]+>/) {
            $stats->{original_by_author}{$email}++;
        }
    }

    my $subject = canonical_subject($headers->{subject});
    if (!defined $subject) {
        $stats->{articles_by_thread}{$MALFORMED}++;
    }
    else {
        my $thread = lc $subject;
        $stats->{articles_by_thread}{$thread}++;
        $stats->{subject_names}{$thread} = $subject
            unless defined $stats->{subject_names}{$thread};
    }

    return;
}

# top_entries(COUNTS [, LIMIT])
#
# Returns the keys of the hash reference COUNTS whose count is greater than
# one, highest count first (ties in key order so the output is repeatable),
# cut off after LIMIT entries (default $MAX_LINES).
sub top_entries {
    my ($count_of, $limit) = @_;
    $limit = $MAX_LINES unless defined $limit;

    my @keys = sort { $count_of->{$b} <=> $count_of->{$a} or $a cmp $b }
               grep { $count_of->{$_} > 1 }
               keys %$count_of;
    splice @keys, $limit if @keys > $limit;
    return @keys;
}

# print_page_header(NEWSGROUP)
#
# Prints the start of the HTML page: title, disclaimer, creation time and the
# links to the three tables.
sub print_page_header {
    my ($newsgroup) = @_;
    my $group = escape_html($newsgroup);
    my $time  = localtime;

    print <<EOF;
<html>
<head>
<title>$group analysis</title>
</head>
<body>

<h1>Newsgroup analysis for $group</h1>

<p>This was produced using an ugly perl / News::NNTPClient hack, copyright (c) 1999 Chris Lightfoot (home page).</p>

<p>This is a compilation of publically-accessible information. It is provided for convenience only, and no assertion of its accuracy is made.</p>

<p>Analysis created at $time.</p>

<p>[ <a href="#authors">articles by author</a> | <a href="#original">original articles by poster</a> | <a href="#threads">articles by thread</a> ]</p>
EOF
    return;
}

# print_ranking(anchor => ..., title => ..., intro => ..., column => ...,
#               counts => HASHREF, label => CODEREF)
#
# Prints one league table.  "counts" maps keys to article counts; "label" is
# called with a key and must return ready-escaped HTML for the second column.
# The bar of the top entry is $WIDTH pixels wide and the others are scaled
# in proportion.
sub print_ranking {
    my (%arg) = @_;
    my $count_of = $arg{counts};
    my @keys     = top_entries($count_of);

    print <<EOF;

<h2><a name="$arg{anchor}">$arg{title}</a></h2>

<p>$arg{intro}</p>

<table>
<tr><th>rank</th><th>$arg{column}</th><th>postings</th><th></th></tr>
EOF

    # top_entries only returns counts above one, so $max is never zero
    # when the loop below runs
    my $max  = @keys ? $count_of->{ $keys[0] } : 0;
    my $rank = 0;
    for my $key (@keys) {
        $rank++;
        my $count = $count_of->{$key};
        my $w     = int($WIDTH * $count / $max);
        my $label = $arg{label}->($key);
        print <<EOF;
<tr><td>$rank</td><td>$label</td><td>$count</td><td><div title="Bar graph" style="width: ${w}px; height: 1em; background: #336699"></div></td></tr>
EOF
    }

    print <<EOF;
</table>
EOF
    return;
}

# print_report(NEWSGROUP, STATS)
#
# Prints the complete HTML report for NEWSGROUP from the statistics
# structure STATS.
sub print_report {
    my ($newsgroup, $stats) = @_;

    # author rows show the name (when known) above the munged address
    my $author_label = sub {
        my ($email) = @_;
        my $name    = $stats->{author_names}{$email};
        my $address = escape_html(munge_email($email));
        return $address unless defined $name && length $name;
        return escape_html($name) . "<br>\n" . $address;
    };

    # thread rows show the subject as first seen, falling back to the key
    my $thread_label = sub {
        my ($thread) = @_;
        my $subject = $stats->{subject_names}{$thread};
        return escape_html(defined $subject ? $subject : $thread);
    };

    print_page_header($newsgroup);

    # sort the damn things and print them out
    print_ranking(
        anchor => 'authors',
        title  => 'Top authors posting (original and followup articles)',
        intro  => 'It is assumed that each author has one unique email address. Note that addresses have been munged slightly to prevent an obvious spam scam.',
        column => 'author',
        counts => $stats->{articles_by_author},
        label  => $author_label,
    );

    # original postings only
    print_ranking(
        anchor => 'original',
        title  => 'Top authors posting (original articles only)',
        intro  => '"Original" articles are those with no References: header.',
        column => 'author',
        counts => $stats->{original_by_author},
        label  => $author_label,
    );

    # threads
    print_ranking(
        anchor => 'threads',
        title  => 'Most popular threads',
        intro  => 'Threads are assessed heuristically, on the basis of Subject: header.',
        column => 'subject',
        counts => $stats->{articles_by_thread},
        label  => $thread_label,
    );

    print <<EOF;

</body>
</html>
EOF
    return;
}

# main(SERVER, NEWSGROUP)
#
# Collects the headers of every article posted to NEWSGROUP on SERVER during
# the last $PERIOD seconds and prints the report.  Dies with a usage message
# if either argument is missing.  Returns 0 for use as the exit status.
sub main {
    my ($server, $newsgroup) = @_;
    die "usage: $0 server newsgroup\n"
        unless defined $server && defined $newsgroup;

    # loaded here rather than at file scope so that the helper subs can be
    # used (and tested) on a machine without the module installed
    require News::NNTPClient;

    my $nntp_serv = News::NNTPClient->new($server)
        or die "$0: cannot create NNTP client for $server\n";
    my $start = time - $PERIOD;

    my $stats = new_stats();
    for my $article ($nntp_serv->newnews($newsgroup, $start)) {
        # retrieve headers for this article and accumulate statistics
        my @head = $nntp_serv->head($article);
        tally_article(parse_headers(@head), $stats);
    }

    print_report($newsgroup, $stats);
    return 0;
}

# run only when executed as a program, not when loaded with require/do
exit main(@ARGV) unless caller;

1;