#!/usr/bin/perl
#
# Walks the Google Groups topic listing for one newsgroup, collects the
# ids found in the "browse_thread/thread/<id>/" links, then downloads the
# "dmode=source" view of each id and appends it to "<group>.raw".
#
# NOTE(review): ids are captured from *thread* links but are fetched
# through the /msg/ URL -- confirm the two id spaces are interchangeable.

use strict;
use warnings;

use HTML::TreeBuilder;
use HTTP::Cookies;
use LWP;

my $group     = "rec.games.corewar";
my $save_file = "$group.raw";

# One shared user agent (with a cookie jar) is used for every request.
my $ua = LWP::UserAgent->new;
$ua->timeout(12);
$ua->agent("Mozilla/4.76 [en] (Windows NT 5.0; U)");
$ua->cookie_jar(HTTP::Cookies->new);

# ---------------------------------------------------------------------
# Phase 1: page through the topic listing and collect ids.
# ---------------------------------------------------------------------
my $start = 0;
my $next  = "STILL GOING";
my %ARTICLES;    # id => number of times it was seen (used for de-duplication)
my @LIST;        # every id in the order found, duplicates included

# A listing page that keeps failing would otherwise be retried forever,
# so stop after this many failures in a row and work with what we have.
my $max_page_failures = 10;
my $page_failures     = 0;

while ($next !~ /THE END/) {
    my ($rl, $rnext) = get_articles($start);

    $ARTICLES{$_}++ for @$rl;
    push @LIST, @$rl;

    # The next page offset is the total number of links seen so far.
    $start = @LIST;
    print "Next START = $start\n";
    $next = $$rnext;

    if ($next eq "NO URL") {
        $page_failures++;
        if ($page_failures >= $max_page_failures) {
            warn "Giving up on the topic listing after $page_failures consecutive failures\n";
            last;
        }
    }
    else {
        $page_failures = 0;
    }

    # Randomised pause between listing requests.
    sleep(5 + int rand 15);
}

my $count = keys %ARTICLES;
print "Found $count articles!\n";

# ---------------------------------------------------------------------
# Phase 2: download every id once, remembering the ones that failed.
# ---------------------------------------------------------------------
my $ret = 0;
my @RETRY;
for my $article (keys %ARTICLES) {
    if (save_article($article) eq "False") {
        push @RETRY, $article;
    }
    else {
        $ret++;
    }
}
print "Retrived $ret articles of $count\n";

# ---------------------------------------------------------------------
# Phase 3: give each failed id exactly one more attempt.  The list is
# not extended while it is being walked, so this pass always terminates.
# ---------------------------------------------------------------------
for my $article (@RETRY) {
    $ret++ if save_article($article) ne "False";
}
print "Retrived $ret articles of $count (After retrying)\n";
print "Done.";

# save_article($article)
#
# Fetches the source view of $article and appends the response body to
# $save_file.
#
# Returns the string "False" when the download fails or the body cannot
# be written; returns "True" otherwise.  Callers compare against "False".
sub save_article {
    my $article = shift;

    my $url = "http://groups-beta.google.com/group/$group/msg/$article?dmode=source";
    my $response = $ua->request(HTTP::Request->new('GET', $url));

    unless ($response->is_success) {
        print "Failed to retrieve original article!";
        return "False";
    }

    # Append mode: all articles accumulate in a single file.
    my $fh;
    unless (open($fh, '>>', $save_file)) {
        warn "Cannot open $save_file for appending: $!\n";
        return "False";
    }
    print {$fh} $response->content;

    # Buffered write errors only surface at close time.
    unless (close($fh)) {
        warn "Cannot finish writing $save_file: $!\n";
        return "False";
    }

    return "True";
}

# get_articles($start)
#
# Fetches the topic listing page at offset $start and extracts the id
# from every link of the form "browse_thread/thread/<id>/".
#
# Returns a two-element list: a reference to the array of ids found, and
# a reference to a status string, which is
#   "THE END"     - the page says there are no more topics,
#   "STILL GOING" - the page was fetched and more may follow,
#   "NO URL"      - the request failed (the id array is then empty).
sub get_articles {
    my $start = shift;

    my $url = "http://groups-beta.google.com/group/$group?gvc=2&start=$start";
    print "Using url $url\n";

    my $response = $ua->request(HTTP::Request->new('GET', $url));
    my @found;

    unless ($response->is_success) {
        print "Failed to get $url\n";
        my $status = "NO URL";
        return (\@found, \$status);
    }

    my $root = HTML::TreeBuilder->new;
    $root->parse($response->content);
    $root->eof();

    for my $anchor ($root->find_by_tag_name('a')) {
        my $link = $anchor->attr('href');
        next unless defined $link;    # anchors without an href cannot match
        if ($link =~ m{browse_thread/thread/([a-z0-9]+)/}) {
            push @found, $1;
        }
    }

    my $next_link;
    if ($response->content =~ /No more topics in this group/) {
        $next_link = "THE END";
        # The line break inside this message is reproduced as found.
        print "Found the last \narticles\n";
    }
    else {
        $next_link = "STILL GOING";
    }

    # Release the parse tree explicitly once the links have been read.
    $root->delete();

    return (\@found, \$next_link);
}