Paste-Scrape

By Ziddykins on Apr 25, 2013

Scrapes various pastebin/pastebin-alikes and saves them to a site-specific folder under the paste's ID.

#!/usr/bin/perl
use warnings; use strict; use LWP::Simple;

# Interactive entry point: show the menu, read one choice from standard
# input and run the matching scraper(s).
print "What would you like to scrape?\n1 - Pastebin.com\n2 - Pastebin.ca\n3 - Both\n4 - GitPaste(all)\n5 - Pastie.org\n";
print "Choice[1-5]: ";

# Read from STDIN explicitly: the bare <> operator would instead try to read
# the answer from any files named on the command line.
my $option = <STDIN>;
$option = '' unless defined $option;    # EOF on stdin falls through to "Invalid option"
chomp $option;
$option =~ s/\A\s+|\s+\z//g;            # tolerate stray spaces and a trailing CR

if ($option eq "1") {
    print "\nWorking\n";
    Com();
}
elsif ($option eq "2") {
    print "\nWorking\n";
    Ca();
}
elsif ($option eq "3") {
    # "Both" runs the two pastebin scrapers one after the other.
    print "\nWorking[com]\n";
    Com();
    print "\nFinished\n";
    print "\nWorking [ca]\n";
    Ca();
}
elsif ($option eq "4") {
    Git();
}
elsif ($option eq "5") {
    Pastie();
}
else {
    print "Invalid option\n";
}

# Scrape the pastebin.com archive page and save every listed paste as
# Com/<id>.
#
# Steps: download the archive HTML into the scratch file "output", call
# Gather1() to extract candidate paste IDs into "codes1", then fetch the raw
# text of each ID that is not already present under Com/.  Both scratch
# files are removed at the end.  Takes no arguments; the return value is not
# meaningful.
sub Com {
    my $archive_url = 'http://www.pastebin.com/archive';
    my $html = get($archive_url);
    unless (defined $html) {
        print "Unable to fetch $archive_url\n";
        return;
    }

    # Nothing else creates the output directory, and without it every save
    # below would fail.
    unless (-d 'Com' or mkdir 'Com') {
        print "Unable to create directory Com: $!\n";
        return;
    }

    # get() hands back decoded text, so write it through a UTF-8 layer
    # instead of letting Perl guess (and warn about wide characters).
    my $scratch;
    unless (open($scratch, '>:encoding(UTF-8)', 'output')) {
        print "Unable to write output: $!\n";
        return;
    }
    print {$scratch} $html;
    close($scratch);

    Gather1();

    my @codes;
    if (open(my $list, '<', 'codes1')) {
        chomp(@codes = <$list>);
        close($list);
    }
    else {
        print "Unable to read codes1: $!\n";
    }

    foreach my $line (@codes) {
        # IDs are scraped from untrusted HTML and end up in a file path, so
        # only plain alphanumeric IDs are accepted (no "/" or ".." tricks).
        unless ($line =~ /\A[A-Za-z0-9]+\z/) {
            print "Skipping invalid ID: $line\n";
            next;
        }
        if (-e "Com/$line") {
            print "Repeat: $line. Skipping. \n";
            next;
        }

        my $raw_url = "http://www.pastebin.com/raw.php?i=$line";
        print "Grabbing $line\n";
        my $text = get($raw_url);
        # Throttle after every request, failed ones included.
        sleep 3; #Leave it. Pastebin will ban you if you grab too quick.

        # Test definedness, not truth: an empty or "0" paste is still a paste.
        unless (defined $text) {
            print "Unable to grab $line\n";
            next;
        }

        if (open(my $paste_fh, '>:encoding(UTF-8)', "Com/$line")) {
            print {$paste_fh} $text;
            close($paste_fh) or print "Unable to finish writing Com/$line: $!\n";
        }
        else {
            print "Unable to write Com/$line: $!\n";
        }
    }

    # Remove the scratch files without shelling out to rm.
    unlink 'output', 'codes1';
}

# Extract candidate pastebin.com paste IDs from the scratch file "output"
# and write them, one per line, to "codes1".
#
# This reproduces the former cat|grep|sed pipeline in plain Perl:
#   * lines matching "ads", "settings" or "post.php" are ignored entirely
#     (the dot in "post.php" is a regex wildcard, exactly as it was for grep);
#   * every occurrence of "/XXXXXXXX" (double quote, slash, any eight
#     characters, double quote) yields one ID;
#   * any double quotes inside the eight characters are stripped.
# Duplicates and document order are kept.  "codes1" is rewritten from
# scratch on each call so a leftover file from an aborted run cannot leak
# stale IDs into this one.  Takes no arguments.
sub Gather1 {
    # Open the result file first so it exists (possibly empty) even when
    # the input turns out to be unreadable.
    my $codes_fh;
    unless (open($codes_fh, '>', 'codes1')) {
        print "Unable to write codes1: $!\n";
        return;
    }

    if (open(my $html_fh, '<', 'output')) {
        while (my $html_line = <$html_fh>) {
            next if $html_line =~ /ads|settings|post.php/;
            while ($html_line =~ m{"/(.{8})"}g) {
                my $id = $1;
                $id =~ tr/"//d;
                print {$codes_fh} "$id\n";
            }
        }
        close($html_fh);
    }
    else {
        print "Unable to read output: $!\n";
    }

    close($codes_fh) or print "Unable to finish writing codes1: $!\n";
    return;
}

# Scrape the pastebin.ca front page and save every listed paste as Ca/<id>.
#
# Steps: download the front page into the scratch file "output2", call
# Gather2() to extract candidate paste IDs into "codes2", then fetch the raw
# text of each ID that is not already present under Ca/.  Both scratch files
# are removed at the end.  Takes no arguments; the return value is not
# meaningful.
sub Ca {
    my $front_url = 'http://www.pastebin.ca';
    my $html = get($front_url);
    unless (defined $html) {
        print "Unable to fetch $front_url\n";
        return;
    }

    # Nothing else creates the output directory, and without it every save
    # below would fail.
    unless (-d 'Ca' or mkdir 'Ca') {
        print "Unable to create directory Ca: $!\n";
        return;
    }

    # get() hands back decoded text, so write it through a UTF-8 layer
    # instead of letting Perl guess (and warn about wide characters).
    my $scratch;
    unless (open($scratch, '>:encoding(UTF-8)', 'output2')) {
        print "Unable to write output2: $!\n";
        return;
    }
    print {$scratch} $html;
    close($scratch);

    Gather2();

    my @codes2;
    if (open(my $list, '<', 'codes2')) {
        chomp(@codes2 = <$list>);
        close($list);
    }
    else {
        print "Unable to read codes2: $!\n";
    }

    foreach my $line2 (@codes2) {
        # IDs are scraped from untrusted HTML and end up in a file path, so
        # only plain alphanumeric IDs are accepted (no "/" or ".." tricks).
        unless ($line2 =~ /\A[A-Za-z0-9]+\z/) {
            print "Skipping invalid ID: $line2\n";
            next;
        }
        if (-e "Ca/$line2") {
            print "Repeat: $line2. Skipping\n";
            next;
        }

        my $raw_url = "http://www.pastebin.ca/raw/$line2";
        print "Grabbing $line2\n";
        my $text = get($raw_url);

        # Test definedness, not truth: an empty or "0" paste is still a paste.
        unless (defined $text) {
            print "Unable to grab $line2\n";
            next;
        }

        if (open(my $paste_fh, '>:encoding(UTF-8)', "Ca/$line2")) {
            print {$paste_fh} $text;
            close($paste_fh) or print "Unable to finish writing Ca/$line2: $!\n";
        }
        else {
            print "Unable to write Ca/$line2: $!\n";
        }
    }

    # Remove the scratch files without shelling out to rm.
    unlink 'output2', 'codes2';
}

# Extract candidate pastebin.ca paste IDs from the scratch file "output2"
# and write them, sorted and de-duplicated, one per line, to "codes2".
#
# This reproduces the former cat|grep|sed|sort|uniq pipeline in plain Perl:
#   * lines matching "oid.php" or "new.php" are ignored entirely (the dots
#     are regex wildcards, exactly as they were for grep);
#   * every occurrence of "/XXXXXXX" (double quote, slash, any seven
#     characters, double quote) yields one ID;
#   * any double quotes inside the seven characters are stripped.
# IDs are sorted with Perl's default string sort.  No intermediate temp file
# is used, so nothing stale can be picked up from an aborted earlier run.
# Takes no arguments.
sub Gather2 {
    my %seen;

    if (open(my $html_fh, '<', 'output2')) {
        while (my $html_line = <$html_fh>) {
            next if $html_line =~ /oid.php|new.php/;
            while ($html_line =~ m{"/(.{7})"}g) {
                my $id = $1;
                $id =~ tr/"//d;
                $seen{$id} = 1;
            }
        }
        close($html_fh);
    }
    else {
        print "Unable to read output2: $!\n";
    }

    # Always (re)create codes2, even when empty, so the caller can read it.
    my $codes_fh;
    unless (open($codes_fh, '>', 'codes2')) {
        print "Unable to write codes2: $!\n";
        return;
    }
    foreach my $id (sort keys %seen) {
        print {$codes_fh} "$id\n";
    }
    close($codes_fh) or print "Unable to finish writing codes2: $!\n";
    return;
}

# Walk gitpaste.com paste numbers 1 through 2129 and save each one that can
# be downloaded as Git/<number>.  Numbers already saved are skipped.  Takes
# no arguments; the return value is not meaningful.
sub Git {
    # Nothing else creates the output directory, and without it every save
    # below would fail.
    unless (-d 'Git' or mkdir 'Git') {
        print "Unable to create directory Git: $!\n";
        return;
    }

    # A range loop replaces the hand-maintained counter, which had to be
    # incremented separately on every branch.
    foreach my $paste (1 .. 2129) {
        if (-e "Git/$paste") {
            print "Repeat: $paste. Skipping\n";
            next;
        }

        my $raw_url = "http://www.gitpaste.com/paste/$paste/raw/";
        print "Grabbing $paste\n";
        my $text = get($raw_url);

        # Test definedness, not truth: an empty or "0" paste is still a paste.
        unless (defined $text) {
            print "Unable to grab $paste\n";
            next;
        }

        # get() hands back decoded text, so write it through a UTF-8 layer.
        if (open(my $paste_fh, '>:encoding(UTF-8)', "Git/$paste")) {
            print {$paste_fh} $text;
            close($paste_fh) or print "Unable to finish writing Git/$paste: $!\n";
        }
        else {
            print "Unable to write Git/$paste: $!\n";
        }
    }
    return;
}

# Scrape the pastie.org paste listing and save every listed paste under
# Pastie/.
#
# Steps: download the listing into the scratch file "output3", call
# Gather3() to extract candidate IDs into "codes3", then fetch the text of
# each ID that is not already saved.  Each entry in "codes3" carries a
# leading slash (for example "/1234567"), which is why the URL and the file
# path below are built without adding one.  Both scratch files are removed
# at the end.  Takes no arguments; the return value is not meaningful.
sub Pastie {
    my $listing_url = 'http://pastie.org/pastes/';
    my $html = get($listing_url);
    unless (defined $html) {
        print "Unable to fetch $listing_url\n";
        return;
    }

    # Nothing else creates the output directory, and without it every save
    # below would fail.
    unless (-d 'Pastie' or mkdir 'Pastie') {
        print "Unable to create directory Pastie: $!\n";
        return;
    }

    # get() hands back decoded text, so write it through a UTF-8 layer
    # instead of letting Perl guess (and warn about wide characters).
    my $scratch;
    unless (open($scratch, '>:encoding(UTF-8)', 'output3')) {
        print "Unable to write output3: $!\n";
        return;
    }
    print {$scratch} $html;
    close($scratch);

    Gather3();

    my @codes3;
    if (open(my $list, '<', 'codes3')) {
        chomp(@codes3 = <$list>);
        close($list);
    }
    else {
        print "Unable to read codes3: $!\n";
    }

    foreach my $line3 (@codes3) {
        # Entries containing "/y/" are not individual pastes; skip silently.
        next if $line3 =~ m{/y/};

        # Entries are scraped from untrusted HTML and end up in a file path,
        # so accept only a slash followed by plain alphanumerics.
        unless ($line3 =~ m{\A/[A-Za-z0-9]+\z}) {
            print "Skipping invalid ID: $line3\n";
            next;
        }
        if (-e "Pastie/$line3") {
            print "Repeat: $line3. Skipping\n";
            next;
        }

        my $text_url = "http://pastie.org/pastes$line3/text";
        print "Grabbing $line3\n";
        my $text = get($text_url);

        # Test definedness, not truth: an empty or "0" paste is still a paste.
        unless (defined $text) {
            print "Unable to grab $line3\n";
            next;
        }

        if (open(my $paste_fh, '>:encoding(UTF-8)', "Pastie/$line3")) {
            print {$paste_fh} $text;
            close($paste_fh) or print "Unable to finish writing Pastie/$line3: $!\n";
        }
        else {
            print "Unable to write Pastie/$line3: $!\n";
        }
    }

    # Remove the scratch files without shelling out to rm.
    unlink 'output3', 'codes3';
}

# Extract candidate pastie.org paste IDs from the scratch file "output3"
# and write them, sorted and de-duplicated, one per line, to "codes3".
#
# This reproduces the former cat|grep|sed|sort|uniq pipeline in plain Perl:
#   * lines containing "new" (in any letter case) are ignored entirely;
#   * every occurrence of "pastes/" followed by any seven characters yields
#     one entry;
#   * only the word "pastes" is removed, so each entry keeps its leading
#     slash (for example "/1234567"), as the old sed expression left it.
# Entries are sorted with Perl's default string sort.  No intermediate temp
# file is used, so nothing stale can be picked up from an aborted earlier
# run.  Takes no arguments.
sub Gather3 {
    my %seen;

    if (open(my $html_fh, '<', 'output3')) {
        while (my $html_line = <$html_fh>) {
            next if $html_line =~ /new/i;
            while ($html_line =~ m{pastes(/.{7})}g) {
                $seen{$1} = 1;
            }
        }
        close($html_fh);
    }
    else {
        print "Unable to read output3: $!\n";
    }

    # Always (re)create codes3, even when empty, so the caller can read it.
    my $codes_fh;
    unless (open($codes_fh, '>', 'codes3')) {
        print "Unable to write codes3: $!\n";
        return;
    }
    foreach my $entry (sort keys %seen) {
        print {$codes_fh} "$entry\n";
    }
    close($codes_fh) or print "Unable to finish writing codes3: $!\n";
    return;
}

Comments

Sign in to comment.
Hawkee   -  Apr 25, 2013

Interesting. So you use this to save code you'd like to use?

Sorasyn  -  Apr 26, 2013

It seems a tad too automated for something that intricate. My guess is it'll grab whatever's in reach, regardless of content.

Hawkee  -  Apr 26, 2013

I've noticed a pattern lately of new users posting code or apps, yet they never reply to threads.

Sorasyn  -  Apr 26, 2013

It would seem so. It's a tad perplexing as to why they wouldn't stick around to, at the very least, respond to questions and/or debug their code.

Hawkee  -  Apr 26, 2013

It just makes me wonder if the email notices are going through. Hopefully they aren't being caught by some spam protection.

Sorasyn  -  Apr 26, 2013

They're coming through just fine on my new email account. Although, with enough activity, I can see them getting filtered out.

Ziddykins  -  Apr 30, 2013

I'm here.
It scrapes everything on the front page, except for the options marked "all": those sites have so few pastes that I figured we might as well scrape 'em all. Some pastes aren't listed, so this picks them up. Pastebin will ban you from the site without that sleep in there. Anyway, you find some interesting stuff.

Sign in to comment

Are you sure you want to unfollow this person?
Are you sure you want to delete this?
Click "Unsubscribe" to stop receiving notices pertaining to this post.
Click "Subscribe" to resume notices pertaining to this post.