#!/usr/bin/perl
use HTML::TreeBuilder;
use HTTP::Cookies;
use LWP;
# ---------------------------------------------------------------------------
# Configuration and shared state.  $group, $save_file and $ua are file-scoped
# lexicals that save_article() and get_articles() read directly.
# ---------------------------------------------------------------------------
my $group     = "rec.games.corewar";
my $save_file = "$group.raw";

my $ua = LWP::UserAgent->new;
$ua->timeout(12);
$ua->agent("Mozilla/4.76 [en] (Windows NT 5.0; U)");
$ua->cookie_jar(HTTP::Cookies->new);

# ---------------------------------------------------------------------------
# Phase 1: page through the group index, collecting article ids.
#
# get_articles() returns (ref-to-id-list, ref-to-status-string).  The status
# contains "THE END" once the last index page has been reached.  The next
# page offset is the total number of ids seen so far (duplicates included).
#
# NOTE(review): get_articles() captures ids from browse_thread/thread/ links
# while save_article() requests /msg/<id>; confirm these ids are
# interchangeable on the remote site.
# ---------------------------------------------------------------------------
my $start = 0;
my $next  = "STILL GOING";
my %ARTICLES;    # id => number of times it was seen (used to de-duplicate)
my @all_ids;     # every id in the order found; its size drives the offset

# A page that fails to download, or that yields no ids, leaves $start
# unchanged, so the same URL would be requested again.  Cap the number of
# consecutive such attempts so a dead server cannot hang the script forever.
my $max_stalled_attempts = 10;
my $stalled_attempts     = 0;

while ($next !~ /THE END/) {
    my ($rl, $rnext) = get_articles($start);

    $ARTICLES{$_}++ for @$rl;
    push @all_ids, @$rl;

    $start = @all_ids;
    print "Next START = $start\n";

    $next = $$rnext;
    last if $next =~ /THE END/;    # no need to wait after the final page

    if (@$rl) {
        $stalled_attempts = 0;
    }
    elsif (++$stalled_attempts >= $max_stalled_attempts) {
        print "Giving up on the index after $stalled_attempts attempts without progress\n";
        last;
    }

    # Randomised delay between index requests.
    sleep(5 + int rand 15);
}

my $count = keys %ARTICLES;
print "Found $count articles!\n";

# ---------------------------------------------------------------------------
# Phase 2: download every unique article; remember the ones that failed.
# save_article() returns the string "False" on failure.
# ---------------------------------------------------------------------------
my $ret = 0;
my @RETRY;
foreach my $article (keys %ARTICLES) {
    my $success = save_article($article);
    if ($success eq "False") {
        push @RETRY, $article;
    }
    else {
        $ret++;
    }
}
print "Retrived $ret articles of $count\n";

# ---------------------------------------------------------------------------
# Phase 3: retry each failed article exactly once.  The retry list is not
# modified while it is being walked, so this pass always terminates.
# ---------------------------------------------------------------------------
foreach my $article (@RETRY) {
    my $success = save_article($article);
    $ret++ if $success ne "False";
}
print "Retrived $ret articles of $count (After retrying)\n";
print "Done.";
# save_article($article)
#
# Fetches one article from the group's /msg/<id>?dmode=source URL and appends
# the response body to $save_file.
#
# Reads the file-scoped $ua, $group and $save_file.
#
# Returns the string "False" if the download fails or the body cannot be
# written to $save_file; returns 1 on success.  Callers compare the result
# against "False".
sub save_article {
    my $article = shift;

    # An empty id cannot name an article; treat it as a failed download.
    return "False" unless defined $article && length $article;

    my $url = "http://groups-beta.google.com/group/$group/msg/$article?dmode=source";
    my $response = $ua->request(HTTP::Request->new('GET', $url));

    if (!$response->is_success) {
        print "Failed to retrieve original article $article: ",
            $response->status_line, "\n";
        return "False";
    }

    # Append mode: every article is accumulated into the same file.
    my $out;
    if (!open($out, '>>', $save_file)) {
        print "Cannot open $save_file for appending: $!\n";
        return "False";
    }
    my $written = print {$out} $response->content;

    # Buffered write errors may only surface at close, so check both.
    if (!close($out) || !$written) {
        print "Error writing article $article to $save_file: $!\n";
        return "False";
    }

    return 1;
}
# get_articles($start)
#
# Downloads one index page of the group, starting at offset $start, and
# extracts the id from every <a href> that matches
# browse_thread/thread/<id>/ .
#
# Reads the file-scoped $ua and $group.
#
# Returns a two-element list of references:
#   [0] array ref of the ids found on this page, in document order
#       (duplicates are not removed);
#   [1] scalar ref to a status string:
#         "THE END"     - the page contains "No more topics in this group"
#         "STILL GOING" - the page was fetched and more pages may follow
#         "NO URL"      - the page could not be fetched (id list is empty)
sub get_articles {
    my $start = shift;

    my $url = "http://groups-beta.google.com/group/$group?gvc=2&start=$start";
    print "Using url $url\n";

    my $response = $ua->request(HTTP::Request->new('GET', $url));
    if (!$response->is_success) {
        print "Failed to get $url\n";
        return ([], \"NO URL");
    }

    my $content = $response->content;

    my $root = HTML::TreeBuilder->new;
    $root->parse($content);
    $root->eof();

    my @thread_ids;
    foreach my $anchor ($root->find_by_tag_name('a')) {
        my $link = $anchor->attr('href');
        next unless defined $link;    # anchors without href have nothing to match
        if ($link =~ m{browse_thread/thread/([a-z0-9]+)/}) {
            push @thread_ids, $1;
        }
    }

    # Release the parse tree explicitly once the links have been read.
    $root->delete();

    my $next_link;
    if ($content =~ /No more topics in this group/) {
        $next_link = "THE END";
        print "Found the last articles\n";
    }
    else {
        $next_link = "STILL GOING";
    }

    return (\@thread_ids, \$next_link);
}