Scrapes various pastebin and pastebin-alike sites and saves each paste to a site-specific folder, named by the paste's ID.
#!/usr/bin/perl
use warnings; use strict; use LWP::Simple;
# Interactive entry point: show the menu, read one choice, and dispatch to
# the matching scraper via a table instead of an if/elsif ladder.
my %dispatch = (
    '1' => sub { print "\nWorking\n"; Com(); },
    '2' => sub { print "\nWorking\n"; Ca(); },
    '3' => sub {
        print "\nWorking[com]\n";
        Com();
        print "\nFinished\n";
        print "\nWorking [ca]\n";
        Ca();
    },
    '4' => \&Git,
    '5' => \&Pastie,
);

print "What would you like to scrape?\n1 - Pastebin.com\n2 - Pastebin.ca\n3 - Both\n4 - GitPaste(all)\n5 - Pastie.org\n";
print "Choice[1-5]: ";
chomp(my $option = <>);

if (my $action = $dispatch{$option}) {
    $action->();
} else {
    print "Invalid option\n";
}
# Scrape the Pastebin.com public archive: fetch the archive page, let
# Gather1() extract the paste IDs, then download each raw paste into Com/.
# Skips pastes already on disk and throttles requests to avoid a ban.
sub Com {
    my $link = 'http://www.pastebin.com/archive';
    my $sou  = get($link);
    defined $sou or die "Unable to fetch $link\n";    # get() returns undef on HTTP failure

    # Dump the archive HTML where Gather1() expects to find it.
    open my $out, '>', 'output' or die "open output: $!";
    print {$out} $sou;
    close $out or die "close output: $!";

    Gather1();

    open my $in, '<', 'codes1' or die "open codes1: $!";
    chomp(my @codes = <$in>);
    close $in;

    foreach my $code (@codes) {
        if (-e "Com/$code") {
            print "Repeat: $code. Skipping. \n";
            next;
        }
        my $raw_link = "http://www.pastebin.com/raw.php?i=$code";
        print "Grabbing $code\n";
        my $paste = get($raw_link);
        unless (defined $paste) {    # explicit defined-check: an empty paste is still a success
            print "Unable to grab $code\n";
            next;
        }
        sleep 3;    # Leave it. Pastebin will ban you if you grab too quick.
        open my $fh, '>', "Com/$code" or die "open Com/$code: $!";
        print {$fh} $paste;
        close $fh or die "close Com/$code: $!";
    }

    # Remove the temporary work files (was a shell-out: `rm output; rm codes1`).
    unlink 'output', 'codes1';
}
# Extract candidate Pastebin.com paste IDs from the saved archive HTML
# ("output") and append them, one per line, to "codes1".  Replaces the old
# fragile shell pipeline (cat|grep|sed with nested backslash escaping) with
# pure Perl carrying the same semantics: drop lines mentioning ads/settings/
# post.php, then pull every quoted href of the form "/XXXXXXXX" (8 chars).
sub Gather1 {
    open my $in,  '<',  'output' or die "open output: $!";
    open my $out, '>>', 'codes1' or die "open codes1: $!";
    while (my $line = <$in>) {
        # Same line filter the old `grep -v -e ads -e settings -e post.php` applied.
        next if $line =~ /ads/ or $line =~ /settings/ or $line =~ /post.php/;
        # `grep -ao "\"/........\""` equivalent: each 8-char ID on its own line.
        while ($line =~ m{"/(.{8})"}g) {
            print {$out} "$1\n";
        }
    }
    close $out or die "close codes1: $!";
    close $in;
}
# Scrape Pastebin.ca: fetch the front page, let Gather2() extract the paste
# IDs, then download each raw paste into Ca/.  Skips pastes already on disk.
# NOTE(review): unlike Com(), the original had no inter-request sleep here;
# that behavior is preserved.
sub Ca {
    my $link = 'http://www.pastebin.ca';
    my $sou  = get($link);
    defined $sou or die "Unable to fetch $link\n";    # get() returns undef on HTTP failure

    # Dump the front-page HTML where Gather2() expects to find it.
    open my $out, '>', 'output2' or die "open output2: $!";
    print {$out} $sou;
    close $out or die "close output2: $!";

    Gather2();

    open my $in, '<', 'codes2' or die "open codes2: $!";
    chomp(my @codes = <$in>);
    close $in;

    foreach my $code (@codes) {
        if (-e "Ca/$code") {
            print "Repeat: $code. Skipping\n";
            next;
        }
        my $raw_link = "http://www.pastebin.ca/raw/$code";
        print "Grabbing $code\n";
        my $paste = get($raw_link);
        unless (defined $paste) {    # explicit defined-check: an empty paste is still a success
            print "Unable to grab $code\n";
            next;
        }
        open my $fh, '>', "Ca/$code" or die "open Ca/$code: $!";
        print {$fh} $paste;
        close $fh or die "close Ca/$code: $!";
    }

    # Remove the temporary work files (was a shell-out: `rm output2; rm codes2`).
    unlink 'output2', 'codes2';
}
# Extract Pastebin.ca paste IDs from the saved front page ("output2") and
# write the sorted, de-duplicated list to "codes2" (fresh file, matching the
# old `sort | uniq > codes2`).  Replaces the shell pipeline — which also
# leaked a "supaaa" temp file and could inherit stale contents from a
# crashed run — with pure Perl: drop oid.php/new.php lines, then pull every
# quoted href of the form "/XXXXXXX" (7 chars).
sub Gather2 {
    open my $in, '<', 'output2' or die "open output2: $!";
    my %seen;    # hash keys give us `sort | uniq` for free
    while (my $line = <$in>) {
        # Same line filter the old `grep -v -e oid.php -e new.php` applied.
        next if $line =~ /oid.php/ or $line =~ /new.php/;
        while ($line =~ m{"/(.{7})"}g) {
            $seen{$1} = 1;
        }
    }
    close $in;

    open my $out, '>', 'codes2' or die "open codes2: $!";
    print {$out} "$_\n" for sort keys %seen;
    close $out or die "close codes2: $!";
}
# Scrape GitPaste by brute force: paste IDs are sequential integers, so walk
# 1..$max-1 and download each raw paste into Git/.  Skips IDs already on
# disk.  $max is optional and defaults to the original hard-coded 2130
# (the site's apparent upper ID bound at the time of writing), so existing
# Git() callers are unaffected.
sub Git {
    my ($max) = @_;
    $max = 2130 unless defined $max;

    # Replaces the old `print ... and $paste++ and next` chain, which only
    # worked because post-increment happened to return a true value.
    for my $paste (1 .. $max - 1) {
        if (-e "Git/$paste") {
            print "Repeat: $paste. Skipping\n";
            next;
        }
        my $link = "http://www.gitpaste.com/paste/$paste/raw/";
        print "Grabbing $paste\n";
        my $sou = get($link);
        unless (defined $sou) {    # get() returns undef on HTTP failure
            print "Unable to grab $paste\n";
            next;
        }
        open my $fh, '>', "Git/$paste" or die "open Git/$paste: $!";
        print {$fh} $sou;
        close $fh or die "close Git/$paste: $!";
    }
}
# Scrape Pastie.org: fetch the pastes listing, let Gather3() extract the
# paste paths, then download each paste's text view into Pastie/.  Skips
# pastes already on disk.  Codes from Gather3() carry a leading "/" (e.g.
# "/1234567"), which the URL format below depends on; the resulting
# "Pastie//1234567" path collapses to "Pastie/1234567" on Unix.
sub Pastie {
    my $link = 'http://pastie.org/pastes/';
    my $sou  = get($link);
    defined $sou or die "Unable to fetch $link\n";    # get() returns undef on HTTP failure

    # Dump the listing HTML where Gather3() expects to find it.
    open my $out, '>', 'output3' or die "open output3: $!";
    print {$out} $sou;
    close $out or die "close output3: $!";

    Gather3();

    open my $in, '<', 'codes3' or die "open codes3: $!";
    chomp(my @codes = <$in>);
    close $in;

    foreach my $code (@codes) {
        # Skip "/y/" paths — presumably private pastes; original skipped them too.
        next if $code =~ /\/y\//;
        if (-e "Pastie/$code") {
            print "Repeat: $code. Skipping\n";
            next;
        }
        my $raw_link = "http://pastie.org/pastes$code/text";
        print "Grabbing $code\n";
        my $paste = get($raw_link);
        unless (defined $paste) {    # explicit defined-check: an empty paste is still a success
            print "Unable to grab $code\n";
            next;
        }
        open my $fh, '>', "Pastie/$code" or die "open Pastie/$code: $!";
        print {$fh} $paste;
        close $fh or die "close Pastie/$code: $!";
    }

    # Remove the temporary work files (was a shell-out: `rm output3; rm codes3`).
    unlink 'output3', 'codes3';
}
# Extract Pastie.org paste paths from the saved listing ("output3") and
# write the sorted, de-duplicated list to "codes3" (fresh file, matching the
# old `sort | uniq > codes3`).  Replaces the shell pipeline, whose
# `sed s/pastes\//i` reached sed as `s/pastes//i` after the shell stripped
# the backslash — i.e. strip the literal "pastes" prefix, KEEPING the
# leading slash ("/1234567"), which Pastie()'s URL format relies on.
sub Gather3 {
    open my $in, '<', 'output3' or die "open output3: $!";
    my %seen;    # hash keys give us `sort | uniq` for free
    while (my $line = <$in>) {
        # Same line filter the old `grep -v -i new` applied.
        next if $line =~ /new/i;
        # `grep -ao "pastes/......."` equivalent; capture keeps the slash.
        while ($line =~ m{pastes(/.{7})}g) {
            $seen{$1} = 1;
        }
    }
    close $in;

    open my $out, '>', 'codes3' or die "open codes3: $!";
    print {$out} "$_\n" for sort keys %seen;
    close $out or die "close codes3: $!";
}
I'm here.
It scrapes everything on the front page of each site, except for the scrapers marked "all": those sites have so few pastes that we might as well scrape them all, and since some pastes aren't listed on the front page, this picks them up too. Pastebin will ban you from the site if you remove that sleep, so leave it in. Anyway, you find some interesting stuff.
It seems a tad too automated for something that intricate. My guess is it'll grab whatever's in reach, regardless of content.