Initial Commit
This commit is contained in:
3590
database/perl/vendor/lib/WWW/Mechanize.pm
vendored
Normal file
3590
database/perl/vendor/lib/WWW/Mechanize.pm
vendored
Normal file
File diff suppressed because it is too large
Load Diff
103
database/perl/vendor/lib/WWW/Mechanize/Cookbook.pod
vendored
Normal file
103
database/perl/vendor/lib/WWW/Mechanize/Cookbook.pod
vendored
Normal file
@@ -0,0 +1,103 @@
|
||||
# PODNAME: WWW::Mechanize::Cookbook
|
||||
# ABSTRACT: Recipes for using WWW::Mechanize
|
||||
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=encoding UTF-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Mechanize::Cookbook - Recipes for using WWW::Mechanize
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
version 2.03
|
||||
|
||||
=head1 INTRODUCTION
|
||||
|
||||
First, please note that many of these are possible just using
|
||||
L<LWP::UserAgent>. Since C<WWW::Mechanize> is a subclass of
|
||||
L<LWP::UserAgent>, whatever works on C<LWP::UserAgent> should work
|
||||
on C<WWW::Mechanize>. See the L<lwpcook> man page included with
|
||||
the L<libwww-perl> distribution.
|
||||
|
||||
=head1 BASICS
|
||||
|
||||
=head2 Launch the WWW::Mechanize browser
|
||||
|
||||
use WWW::Mechanize;
|
||||
|
||||
my $mech = WWW::Mechanize->new( autocheck => 1 );
|
||||
|
||||
The C<< autocheck => 1 >> tells Mechanize to die if any IO fails,
|
||||
so you don't have to manually check. It's easier that way. If you
|
||||
want to do your own error checking, leave it out.
|
||||
|
||||
=head2 Fetch a page
|
||||
|
||||
$mech->get( "http://search.cpan.org" );
|
||||
print $mech->content;
|
||||
|
||||
C<< $mech->content >> contains the raw HTML from the web page. It
|
||||
is not parsed or handled in any way, at least through the C<content>
|
||||
method.
|
||||
|
||||
=head2 Fetch a page into a file
|
||||
|
||||
Sometimes you want to dump your results directly into a file. For
|
||||
example, there's no reason to read a JPEG into memory if you're
|
||||
only going to write it out immediately. This can also help with
|
||||
memory issues on large files.
|
||||
|
||||
$mech->get( "http://www.cpan.org/src/stable.tar.gz",
|
||||
":content_file" => "stable.tar.gz" );
|
||||
|
||||
=head2 Fetch a password-protected page
|
||||
|
||||
Generally, just call C<credentials> before fetching the page.
|
||||
|
||||
$mech->credentials( 'admin' => 'password' );
|
||||
$mech->get( 'http://10.11.12.13/password.html' );
|
||||
print $mech->content();
|
||||
|
||||
=head1 LINKS
|
||||
|
||||
=head2 Find all image links
|
||||
|
||||
Find all links that point to a JPEG, GIF or PNG.
|
||||
|
||||
my @links = $mech->find_all_links(
|
||||
tag => "a", url_regex => qr/\.(jpe?g|gif|png)$/i );
|
||||
|
||||
=head2 Find all download links
|
||||
|
||||
Find all links that have the word "download" in them.
|
||||
|
||||
my @links = $mech->find_all_links(
|
||||
tag => "a", text_regex => qr/\bdownload\b/i );
|
||||
|
||||
=head1 ADVANCED
|
||||
|
||||
=head2 See what will be sent without actually sending anything
|
||||
|
||||
$mech->add_handler("request_send", sub { shift->dump; exit; });
|
||||
$mech->get("http://www.example.com");
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Mechanize>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Andy Lester <andy at petdance.com>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 2004 by Andy Lester.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=cut
|
||||
583
database/perl/vendor/lib/WWW/Mechanize/Examples.pod
vendored
Normal file
583
database/perl/vendor/lib/WWW/Mechanize/Examples.pod
vendored
Normal file
@@ -0,0 +1,583 @@
|
||||
# PODNAME: WWW::Mechanize::Examples
|
||||
# ABSTRACT: Sample programs that use WWW::Mechanize
|
||||
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=encoding UTF-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Mechanize::Examples - Sample programs that use WWW::Mechanize
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
version 2.03
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Plenty of people have learned WWW::Mechanize, and now, you can too!
|
||||
|
||||
Following are user-supplied samples of WWW::Mechanize in action.
|
||||
If you have samples you'd like to contribute, please send 'em to
|
||||
C<< <andy@petdance.com> >>.
|
||||
|
||||
You can also look at the F<t/*.t> files in the distribution.
|
||||
|
||||
Please note that these examples are not intended to do any specific task.
|
||||
For all I know, they're no longer functional because the sites they
|
||||
hit have changed. They're here to give examples of how people have
|
||||
used WWW::Mechanize.
|
||||
|
||||
Note that the examples are in reverse order of my having received them,
|
||||
so the freshest examples are always at the top.
|
||||
|
||||
=head2 Starbucks Density Calculator, by Nat Torkington
|
||||
|
||||
Here's a pair of programs from Nat Torkington, editor for O'Reilly Media
|
||||
and co-author of the I<Perl Cookbook>.
|
||||
|
||||
=over 4
|
||||
|
||||
Rael [Dornfest] discovered that you can easily find out how many Starbucks
|
||||
there are in an area by searching for "Starbucks". So I wrote a silly
|
||||
scraper for some old census data and came up with some Starbucks density
|
||||
figures. There's no meaning to these numbers thanks to errors from using
|
||||
old census data coupled with false positives in Yahoo search (e.g.,
|
||||
"Dodie Starbuck-Your Style Desgn" in Portland OR). But it was fun to
|
||||
waste a night on.
|
||||
|
||||
Here are the top twenty cities in descending order of population,
|
||||
with the amount of territory each Starbucks has. E.g., A New York NY
|
||||
Starbucks covers 1.7 square miles of ground.
|
||||
|
||||
New York, NY 1.7
|
||||
Los Angeles, CA 1.2
|
||||
Chicago, IL 1.0
|
||||
Houston, TX 4.6
|
||||
Philadelphia, PA 6.8
|
||||
San Diego, CA 2.7
|
||||
Detroit, MI 19.9
|
||||
Dallas, TX 2.7
|
||||
Phoenix, AZ 4.1
|
||||
San Antonio, TX 12.3
|
||||
San Jose, CA 1.1
|
||||
Baltimore, MD 3.9
|
||||
Indianapolis, IN 12.1
|
||||
San Francisco, CA 0.5
|
||||
Jacksonville, FL 39.9
|
||||
Columbus, OH 7.3
|
||||
Milwaukee, WI 5.1
|
||||
Memphis, TN 15.1
|
||||
Washington, DC 1.4
|
||||
Boston, MA 0.5
|
||||
|
||||
=back
|
||||
|
||||
C<get_pop_data>
|
||||
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use WWW::Mechanize;
|
||||
use Storable;
|
||||
|
||||
$url = 'http://www.census.gov/population/www/documentation/twps0027.html';
|
||||
$m = WWW::Mechanize->new();
|
||||
$m->get($url);
|
||||
|
||||
$c = $m->content;
|
||||
|
||||
$c =~ m{<A NAME=.tabA.>(.*?)</TABLE>}s
|
||||
or die "Can't find the population table\n";
|
||||
$t = $1;
|
||||
@outer = $t =~ m{<TR.*?>(.*?)</TR>}gs;
|
||||
shift @outer;
|
||||
foreach $r (@outer) {
|
||||
@bits = $r =~ m{<TD.*?>(.*?)</TD>}gs;
|
||||
for ($x = 0; $x < @bits; $x++) {
|
||||
$b = $bits[$x];
|
||||
@v = split /\s*<BR>\s*/, $b;
|
||||
foreach (@v) { s/^\s+//; s/\s+$// }
|
||||
push @{$data[$x]}, @v;
|
||||
}
|
||||
}
|
||||
|
||||
for ($y = 0; $y < @{$data[0]}; $y++) {
|
||||
$data{$data[1][$y]} = {
|
||||
NAME => $data[1][$y],
|
||||
RANK => $data[0][$y],
|
||||
POP => comma_free($data[2][$y]),
|
||||
AREA => comma_free($data[3][$y]),
|
||||
DENS => comma_free($data[4][$y]),
|
||||
};
|
||||
}
|
||||
|
||||
store(\%data, "cities.dat");
|
||||
|
||||
sub comma_free {
|
||||
my $n = shift;
|
||||
$n =~ s/,//;
|
||||
return $n;
|
||||
}
|
||||
|
||||
C<plague_of_coffee>
|
||||
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use WWW::Mechanize;
|
||||
use strict;
|
||||
use Storable;
|
||||
|
||||
$SIG{__WARN__} = sub {} ; # ssssssh
|
||||
|
||||
my $Cities = retrieve("cities.dat");
|
||||
|
||||
my $m = WWW::Mechanize->new();
|
||||
$m->get("http://local.yahoo.com/");
|
||||
|
||||
my @cities = sort { $Cities->{$a}{RANK} <=> $Cities->{$b}{RANK} } keys %$Cities;
|
||||
foreach my $c ( @cities ) {
|
||||
my $fields = {
|
||||
'stx' => "starbucks",
|
||||
'csz' => $c,
|
||||
};
|
||||
|
||||
my $r = $m->submit_form(form_number => 2,
|
||||
fields => $fields);
|
||||
die "Couldn't submit form" unless $r->is_success;
|
||||
|
||||
my $hits = number_of_hits($r);
|
||||
# my $ppl = sprintf("%d", 1000 * $Cities->{$c}{POP} / $hits);
|
||||
# print "$c has $hits Starbucks. That's one for every $ppl people.\n";
|
||||
my $density = sprintf("%.1f", $Cities->{$c}{AREA} / $hits);
|
||||
print "$c : $density\n";
|
||||
}
|
||||
|
||||
sub number_of_hits {
|
||||
my $r = shift;
|
||||
my $c = $r->content;
|
||||
if ($c =~ m{\d+ out of <b>(\d+)</b> total results for}) {
|
||||
return $1;
|
||||
}
|
||||
if ($c =~ m{Sorry, no .*? found in or near}) {
|
||||
return 0;
|
||||
}
|
||||
if ($c =~ m{Your search matched multiple cities}) {
|
||||
warn "Your search matched multiple cities\n";
|
||||
return 0;
|
||||
}
|
||||
if ($c =~ m{Sorry we couldn.t find that location}) {
|
||||
warn "No cities\n";
|
||||
return 0;
|
||||
}
|
||||
if ($c =~ m{Could not find.*?, showing results for}) {
|
||||
warn "No matches\n";
|
||||
return 0;
|
||||
}
|
||||
die "Unknown response\n$c\n";
|
||||
}
|
||||
|
||||
=head2 pb-upload, by John Beppu
|
||||
|
||||
This program takes filenames of images from the command line and
|
||||
uploads them to a www.photobucket.com folder. John Beppu, the author, says:
|
||||
|
||||
=over 4
|
||||
|
||||
I had 92 pictures I wanted to upload, and doing it through a browser
|
||||
would've been torture. But thanks to mech, all I had to do was
|
||||
`./pb.upload *.jpg` and watch it do its thing. It felt good.
|
||||
If I had more time, I'd implement WWW::Photobucket on top of
|
||||
WWW::Mechanize.
|
||||
|
||||
=back
|
||||
|
||||
#!/usr/bin/perl -w -T
|
||||
|
||||
use strict;
|
||||
use WWW::Mechanize;
|
||||
|
||||
my $login = "login_name";
|
||||
my $password = "password";
|
||||
my $folder = "folder";
|
||||
|
||||
my $url = "http://img78.photobucket.com/albums/v281/$login/$folder/";
|
||||
|
||||
# login to your photobucket.com account
|
||||
my $mech = WWW::Mechanize->new();
|
||||
$mech->get($url);
|
||||
$mech->submit_form(
|
||||
form_number => 1,
|
||||
fields => { password => $password },
|
||||
);
|
||||
die unless ($mech->success);
|
||||
|
||||
# upload image files specified on command line
|
||||
foreach (@ARGV) {
|
||||
print "$_\n";
|
||||
$mech->form_number(2);
|
||||
$mech->field('the_file[]' => $_);
|
||||
$mech->submit();
|
||||
}
|
||||
|
||||
=head2 listmod, by Ian Langworth
|
||||
|
||||
Ian Langworth contributes this little gem that will bring joy to
|
||||
beleaguered mailing list admins. It discards spam messages through
|
||||
mailman's web interface.
|
||||
|
||||
#!/arch/unix/bin/perl
|
||||
use strict;
|
||||
use warnings;
|
||||
#
|
||||
# listmod - fast alternative to mailman list interface
|
||||
#
|
||||
# usage: listmod crew XXXXXXXX
|
||||
#
|
||||
|
||||
die "usage: $0 <listname> <password>\n" unless @ARGV == 2;
|
||||
my ($listname, $password) = @ARGV;
|
||||
|
||||
use CGI qw(unescape);
|
||||
|
||||
use WWW::Mechanize;
|
||||
my $m = WWW::Mechanize->new( autocheck => 1 );
|
||||
|
||||
use Term::ReadLine;
|
||||
my $term = Term::ReadLine->new($0);
|
||||
|
||||
# submit the form, get the cookie, go to the list admin page
|
||||
$m->get("https://lists.ccs.neu.edu/bin/admindb/$listname");
|
||||
$m->set_visible( $password );
|
||||
$m->click;
|
||||
|
||||
# exit if nothing to do
|
||||
print "There are no pending requests.\n" and exit
|
||||
if $m->content =~ /There are no pending requests/;
|
||||
|
||||
# select the first form and examine its contents
|
||||
$m->form_number(1);
|
||||
my $f = $m->current_form or die "Couldn't get first form!\n";
|
||||
|
||||
# get me the base form element for each email item
|
||||
my @items = map {m/^.+?-(.+)/} grep {m/senderbanp/} $f->param
|
||||
or die "Couldn't get items in first form!\n";
|
||||
|
||||
# iterate through items, prompt user, commit actions
|
||||
foreach my $item (@items) {
|
||||
|
||||
# show item info
|
||||
my $sender = unescape($item);
|
||||
my ($subject) = [$f->find_input("senderbanp-$item")->value_names]->[1]
|
||||
=~ /Subject:\s+(.+?)\s+Size:/g;
|
||||
|
||||
# prompt user
|
||||
my $choice = '';
|
||||
while ( $choice !~ /^[DAX]$/ ) {
|
||||
print "$sender\: '$subject'\n";
|
||||
$choice = uc $term->readline("Action: defer/accept/discard [dax]: ");
|
||||
print "\n\n";
|
||||
}
|
||||
|
||||
# set button
|
||||
$m->field("senderaction-$item" => {D=>0,A=>1,X=>3}->{$choice});
|
||||
}
|
||||
|
||||
# submit actions
|
||||
$m->click;
|
||||
|
||||
=head2 ccdl, by Andy Lester
|
||||
|
||||
Steve McConnell, author of the landmark I<Code Complete> has put
|
||||
up the chapters for the 2nd edition in PDF format on his website.
|
||||
I needed to download them to take to Kinko's to have printed. This
|
||||
little program did it for me.
|
||||
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
use WWW::Mechanize;
|
||||
|
||||
my $start = "http://www.stevemcconnell.com/cc2/cc.htm";
|
||||
|
||||
my $mech = WWW::Mechanize->new( autocheck => 1 );
|
||||
$mech->get( $start );
|
||||
|
||||
my @links = $mech->find_all_links( url_regex => qr/\d+.+\.pdf$/ );
|
||||
|
||||
for my $link ( @links ) {
|
||||
my $url = $link->url_abs;
|
||||
my $filename = $url;
|
||||
$filename =~ s[^.+/][];
|
||||
|
||||
print "Fetching $url";
|
||||
$mech->get( $url, ':content_file' => $filename );
|
||||
|
||||
print " ", -s $filename, " bytes\n";
|
||||
}
|
||||
|
||||
=head2 quotes.pl, by Andy Lester
|
||||
|
||||
This was a program that was going to get a hack in I<Spidering Hacks>,
|
||||
but got cut at the last minute, probably because it's against IMDB's TOS
|
||||
to scrape from it. I present it here as an example, not a suggestion
|
||||
that you break their TOS.
|
||||
|
||||
Last I checked, it didn't work because their HTML didn't match, but it's
|
||||
still good as sample code.
|
||||
|
||||
#!/usr/bin/perl -w
|
||||
|
||||
use strict;
|
||||
|
||||
use WWW::Mechanize;
|
||||
use Getopt::Long;
|
||||
use Text::Wrap;
|
||||
|
||||
my $match = undef;
|
||||
my $random = undef;
|
||||
GetOptions(
|
||||
"match=s" => \$match,
|
||||
"random" => \$random,
|
||||
) or exit 1;
|
||||
|
||||
my $movie = shift @ARGV or die "Must specify a movie\n";
|
||||
|
||||
my $quotes_page = get_quotes_page( $movie );
|
||||
my @quotes = extract_quotes( $quotes_page );
|
||||
|
||||
if ( $match ) {
|
||||
$match = quotemeta($match);
|
||||
@quotes = grep /$match/i, @quotes;
|
||||
}
|
||||
|
||||
if ( $random ) {
|
||||
print $quotes[rand @quotes];
|
||||
}
|
||||
else {
|
||||
print join( "\n", @quotes );
|
||||
}
|
||||
|
||||
|
||||
sub get_quotes_page {
|
||||
my $movie = shift;
|
||||
|
||||
my $mech = WWW::Mechanize->new;
|
||||
$mech->get( "http://www.imdb.com/search" );
|
||||
$mech->success or die "Can't get the search page";
|
||||
|
||||
$mech->submit_form(
|
||||
form_number => 2,
|
||||
fields => {
|
||||
title => $movie,
|
||||
restrict => "Movies only",
|
||||
},
|
||||
);
|
||||
|
||||
my @links = $mech->find_all_links( url_regex => qr[^/Title] )
|
||||
or die "No matches for \"$movie\" were found.\n";
|
||||
|
||||
# Use the first link
|
||||
my ( $url, $title ) = @{$links[0]};
|
||||
|
||||
warn "Checking $title...\n";
|
||||
|
||||
$mech->get( $url );
|
||||
my $link = $mech->find_link( text_regex => qr/Memorable Quotes/i )
|
||||
or die qq{"$title" has no quotes in IMDB!\n};
|
||||
|
||||
warn "Fetching quotes...\n\n";
|
||||
$mech->get( $link->[0] );
|
||||
|
||||
return $mech->content;
|
||||
}
|
||||
|
||||
|
||||
sub extract_quotes {
|
||||
my $page = shift;
|
||||
|
||||
    # Nibble away at the unwanted HTML at the beginning...
|
||||
$page =~ s/.+Memorable Quotes//si;
|
||||
$page =~ s/.+?(<a name)/$1/si;
|
||||
|
||||
# ... and the end of the page
|
||||
$page =~ s/Browse titles in the movie quotes.+$//si;
|
||||
$page =~ s/<p.+$//g;
|
||||
|
||||
# Quotes separated by an <HR> tag
|
||||
my @quotes = split( /<hr.+?>/, $page );
|
||||
|
||||
for my $quote ( @quotes ) {
|
||||
my @lines = split( /<br>/, $quote );
|
||||
for ( @lines ) {
|
||||
s/<[^>]+>//g; # Strip HTML tags
|
||||
s/\s+/ /g; # Squash whitespace
|
||||
s/^ //; # Strip leading space
|
||||
s/ $//; # Strip trailing space
|
||||
s/"/"/g; # Replace HTML entity quotes
|
||||
|
||||
# Word-wrap to fit in 72 columns
|
||||
$Text::Wrap::columns = 72;
|
||||
$_ = wrap( '', ' ', $_ );
|
||||
}
|
||||
$quote = join( "\n", @lines );
|
||||
}
|
||||
|
||||
return @quotes;
|
||||
}
|
||||
|
||||
=head2 cpansearch.pl, by Ed Silva
|
||||
|
||||
A quick little utility to search the CPAN and fire up a browser
|
||||
with a results page.
|
||||
|
||||
#!/usr/bin/perl
|
||||
|
||||
# turn on perl's safety features
|
||||
use strict;
|
||||
use warnings;
|
||||
|
||||
# work out the name of the module we're looking for
|
||||
my $module_name = $ARGV[0]
|
||||
or die "Must specify module name on command line";
|
||||
|
||||
# create a new browser
|
||||
use WWW::Mechanize;
|
||||
my $browser = WWW::Mechanize->new();
|
||||
|
||||
# tell it to get the main page
|
||||
$browser->get("http://search.cpan.org/");
|
||||
|
||||
# okay, fill in the box with the name of the
|
||||
# module we want to look up
|
||||
$browser->form_number(1);
|
||||
$browser->field("query", $module_name);
|
||||
$browser->click();
|
||||
|
||||
# click on the link that matches the module name
|
||||
$browser->follow_link( text_regex => $module_name );
|
||||
|
||||
my $url = $browser->uri;
|
||||
|
||||
# launch a browser...
|
||||
system('galeon', $url);
|
||||
|
||||
exit(0);
|
||||
|
||||
=head2 lj_friends.cgi, by Matt Cashner
|
||||
|
||||
#!/usr/bin/perl
|
||||
|
||||
# Provides an rss feed of a paid user's LiveJournal friends list
|
||||
# Full entries, protected entries, etc.
|
||||
# Add to your favorite rss reader as
|
||||
# http://your.site.com/cgi-bin/lj_friends.cgi?user=USER&password=PASSWORD
|
||||
|
||||
use warnings;
|
||||
use strict;
|
||||
|
||||
use WWW::Mechanize;
|
||||
use CGI;
|
||||
|
||||
my $cgi = CGI->new();
|
||||
my $form = $cgi->Vars;
|
||||
|
||||
my $agent = WWW::Mechanize->new();
|
||||
|
||||
$agent->get('http://www.livejournal.com/login.bml');
|
||||
$agent->form_number('3');
|
||||
$agent->field('user',$form->{user});
|
||||
$agent->field('password',$form->{password});
|
||||
$agent->submit();
|
||||
$agent->get('http://www.livejournal.com/customview.cgi?user='.$form->{user}.'&styleid=225596&checkcookies=1');
|
||||
print "Content-type: text/plain\n\n";
|
||||
print $agent->content();
|
||||
|
||||
=head2 Hacking Movable Type, by Dan Rinzel
|
||||
|
||||
use strict;
|
||||
use WWW::Mechanize;
|
||||
|
||||
# a tool to automatically post entries to a moveable type weblog, and set arbitrary creation dates
|
||||
|
||||
my $mech = WWW::Mechanize->new();
|
||||
my $entry;
|
||||
$entry->{title} = "Test AutoEntry Title";
|
||||
$entry->{btext} = "Test AutoEntry Body";
|
||||
$entry->{date} = '2002-04-15 14:18:00';
|
||||
my $start = qq|http://my.blog.site/mt.cgi|;
|
||||
|
||||
$mech->get($start);
|
||||
$mech->field('username','und3f1n3d');
|
||||
$mech->field('password','obscur3d');
|
||||
$mech->submit(); # to get login cookie
|
||||
$mech->get(qq|$start?__mode=view&_type=entry&blog_id=1|);
|
||||
$mech->form_name('entry_form');
|
||||
$mech->field('title',$entry->{title});
|
||||
$mech->field('category_id',1); # adjust as needed
|
||||
$mech->field('text',$entry->{btext});
|
||||
$mech->field('status',2); # publish, or 1 = draft
|
||||
$results = $mech->submit();
|
||||
|
||||
# if we're ok with this entry being datestamped "NOW" (no {date} in %entry)
|
||||
# we're done. Otherwise, time to be tricksy
|
||||
# MT returns a 302 redirect from this form. the redirect itself contains a <body onload=""> handler
|
||||
# which takes the user to an editable version of the form where the create date can be edited
|
||||
# MT date format of YYYY-MM-DD HH:MI:SS is the only one that won't error out
|
||||
|
||||
if ($entry->{date} && $entry->{date} =~ /^\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}/) {
|
||||
# travel the redirect
|
||||
$results = $mech->get($results->{_headers}->{location});
|
||||
$results->{_content} =~ /<body onLoad="([^\"]+)"/is;
|
||||
my $js = $1;
|
||||
$js =~ /\'([^']+)\'/;
|
||||
$results = $mech->get($start.$1);
|
||||
$mech->form_name('entry_form');
|
||||
$mech->field('created_on_manual',$entry->{date});
|
||||
$mech->submit();
|
||||
}
|
||||
|
||||
=head2 get-despair, by Randal Schwartz
|
||||
|
||||
Randal submitted this bot that walks the despair.com site sucking down
|
||||
all the pictures.
|
||||
|
||||
use strict;
|
||||
$|++;
|
||||
|
||||
use WWW::Mechanize;
|
||||
use File::Basename;
|
||||
|
||||
my $m = WWW::Mechanize->new;
|
||||
|
||||
$m->get("http://www.despair.com/indem.html");
|
||||
|
||||
my @top_links = @{$m->links};
|
||||
|
||||
for my $top_link_num (0..$#top_links) {
|
||||
next unless $top_links[$top_link_num][0] =~ /^http:/;
|
||||
|
||||
$m->follow_link( n=>$top_link_num ) or die "can't follow $top_link_num";
|
||||
|
||||
print $m->uri, "\n";
|
||||
for my $image (grep m{^http://store4}, map $_->[0], @{$m->links}) {
|
||||
my $local = basename $image;
|
||||
print " $image...", $m->mirror($image, $local)->message, "\n"
|
||||
}
|
||||
|
||||
$m->back or die "can't go back";
|
||||
}
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Andy Lester <andy at petdance.com>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 2004 by Andy Lester.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=cut
|
||||
464
database/perl/vendor/lib/WWW/Mechanize/FAQ.pod
vendored
Normal file
464
database/perl/vendor/lib/WWW/Mechanize/FAQ.pod
vendored
Normal file
@@ -0,0 +1,464 @@
|
||||
# PODNAME: WWW::Mechanize::FAQ
|
||||
# ABSTRACT: Frequently Asked Questions about WWW::Mechanize
|
||||
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=encoding UTF-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Mechanize::FAQ - Frequently Asked Questions about WWW::Mechanize
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
version 2.03
|
||||
|
||||
=head1 How to get help with WWW::Mechanize
|
||||
|
||||
If your question isn't answered here in the FAQ, please turn to the
|
||||
communities at:
|
||||
|
||||
=over
|
||||
|
||||
=item * StackOverflow L<https://stackoverflow.com/questions/tagged/www-mechanize>
|
||||
|
||||
=item * #lwp on irc.perl.org
|
||||
|
||||
=item * L<http://perlmonks.org>
|
||||
|
||||
=item * The libwww-perl mailing list at L<http://lists.perl.org>
|
||||
|
||||
=back
|
||||
|
||||
=head1 JavaScript
|
||||
|
||||
=head2 I have this web page that has JavaScript on it, and my Mech program doesn't work.
|
||||
|
||||
That's because WWW::Mechanize doesn't operate on the JavaScript. It only
|
||||
understands the HTML parts of the page.
|
||||
|
||||
=head2 I thought Mech was supposed to work like a web browser.
|
||||
|
||||
It does pretty much, but it doesn't support JavaScript.
|
||||
|
||||
I added some basic attempts at picking up URLs in C<window.open()>
|
||||
calls and return them in C<< $mech->links >>. They work sometimes.
|
||||
|
||||
Since Javascript is completely visible to the client, it cannot be used
|
||||
to prevent a scraper from following links. But it can make life difficult. If
|
||||
you want to scrape specific pages, then a solution is always possible.
|
||||
|
||||
One typical use of Javascript is to perform argument checking before
|
||||
posting to the server. The URL you want is probably just buried in the
|
||||
Javascript function. Do a regular expression match on
|
||||
C<< $mech->content() >>
|
||||
to find the link that you want and C<< $mech->get >> it directly (this
|
||||
assumes that you know what you are looking for in advance).
|
||||
|
||||
In more difficult cases, the Javascript is used for URL mangling to
|
||||
satisfy the needs of some middleware. In this case you need to figure
|
||||
out what the Javascript is doing (why are these URLs always really
|
||||
long?). There is probably some function with one or more arguments which
|
||||
calculates the new URL. Step one: using your favorite browser, get the
|
||||
before and after URLs and save them to files. Edit each file, converting
|
||||
the argument separators ('?', '&' or ';') into newlines. Now it is
|
||||
easy to use diff or comm to find out what Javascript did to the URL.
|
||||
Step 2 - find the function call which created the URL - you will need
|
||||
to parse and interpret its argument list. The Javascript Debugger in the
|
||||
Firebug extension for Firefox helps with the analysis. At this point, it is
|
||||
fairly trivial to write your own function which emulates the Javascript
|
||||
for the pages you want to process.
|
||||
|
||||
Here's another approach that answers the question, "It works in Firefox,
|
||||
but why not Mech?" Everything the web server knows about the client is
|
||||
present in the HTTP request. If two requests are identical, the results
|
||||
should be identical. So the real question is "What is different between
|
||||
the mech request and the Firefox request?"
|
||||
|
||||
The Firefox extension "Tamper Data" is an effective tool for examining
|
||||
the headers of the requests to the server. Compare that with what LWP
|
||||
is sending. Once the two are identical, the action of the server should
|
||||
be the same as well.
|
||||
|
||||
I say "should", because this is an oversimplification - some values
|
||||
are naturally unique, e.g. a SessionID, but if a SessionID is present,
|
||||
that is probably sufficient, even though the value will be different
|
||||
between the LWP request and the Firefox request. The server could use
|
||||
the session to store information which is troublesome, but that's not
|
||||
the first place to look (and highly unlikely to be relevant when you
|
||||
are requesting the login page of your site).
|
||||
|
||||
Generally the problem is to be found in missing or incorrect POSTDATA
|
||||
arguments, Cookies, User-Agents, Accepts, etc. If you are using mech,
|
||||
then redirects and cookies should not be a problem, but are listed here
|
||||
for completeness. If you are missing headers, C<< $mech->add_header >>
|
||||
can be used to add the headers that you need.
|
||||
|
||||
=head2 Which modules work like Mechanize and have JavaScript support?
|
||||
|
||||
In no particular order: L<Gtk2::WebKit::Mechanize>, L<Win32::IE::Mechanize>,
|
||||
L<WWW::Mechanize::Firefox>, L<WWW::Scripter>, L<WWW::Selenium>
|
||||
|
||||
=head1 How do I do X?
|
||||
|
||||
=head2 Can I do [such-and-such] with WWW::Mechanize?
|
||||
|
||||
If it's possible with LWP::UserAgent, then yes. WWW::Mechanize is
|
||||
a subclass of L<LWP::UserAgent>, so all the wondrous magic of that
|
||||
class is inherited.
|
||||
|
||||
=head2 How do I use WWW::Mechanize through a proxy server?
|
||||
|
||||
See the docs in L<LWP::UserAgent> on how to use the proxy. Short version:
|
||||
|
||||
$mech->proxy(['http', 'ftp'], 'http://proxy.example.com:8000/');
|
||||
|
||||
or get the specs from the environment:
|
||||
|
||||
$mech->env_proxy();
|
||||
|
||||
# Environment set like so:
|
||||
gopher_proxy=http://proxy.my.place/
|
||||
wais_proxy=http://proxy.my.place/
|
||||
no_proxy="localhost,my.domain"
|
||||
export gopher_proxy wais_proxy no_proxy
|
||||
|
||||
=head2 How can I see what fields are on the forms?
|
||||
|
||||
Use the mech-dump utility, optionally installed with Mechanize.
|
||||
|
||||
$ mech-dump --forms http://search.cpan.org
|
||||
Dumping forms
|
||||
GET http://search.cpan.org/search
|
||||
query=
|
||||
mode=all (option) [*all|module|dist|author]
|
||||
<NONAME>=CPAN Search (submit)
|
||||
|
||||
=head2 How do I get Mech to handle authentication?
|
||||
|
||||
use MIME::Base64;
|
||||
|
||||
my $agent = WWW::Mechanize->new();
|
||||
my @args = (
|
||||
Authorization => "Basic " .
|
||||
MIME::Base64::encode( USER . ':' . PASS )
|
||||
);
|
||||
|
||||
$agent->credentials( ADDRESS, REALM, USER, PASS );
|
||||
$agent->get( URL, @args );
|
||||
|
||||
If you want to use the credentials for all future requests, you can
|
||||
also use the L<LWP::UserAgent> C<default_header()> method instead
|
||||
of the extra arguments to C<get()>
|
||||
|
||||
$mech->default_header(
|
||||
Authorization => 'Basic ' . encode_base64( USER . ':' . PASSWORD ) );
|
||||
|
||||
=head2 How can I get WWW::Mechanize to execute this JavaScript?
|
||||
|
||||
You can't. JavaScript is entirely client-based, and WWW::Mechanize
|
||||
is a client that doesn't understand JavaScript. See the top part
|
||||
of this FAQ.
|
||||
|
||||
=head2 How do I check a checkbox that doesn't have a value defined?
|
||||
|
||||
Set it to the value of "on".
|
||||
|
||||
$mech->field( my_checkbox => 'on' );
|
||||
|
||||
=head2 How do I handle frames?
|
||||
|
||||
You don't deal with them as frames, per se, but as links. Extract
|
||||
them with
|
||||
|
||||
my @frame_links = $mech->find_link( tag => "frame" );
|
||||
|
||||
=head2 How do I get a list of HTTP headers and their values?
|
||||
|
||||
All L<HTTP::Headers> methods work on a L<HTTP::Response> object which is
|
||||
returned by the I<get()>, I<reload()>, I<response()/res()>, I<click()>,
|
||||
I<submit_form()>, and I<request()> methods.
|
||||
|
||||
my $mech = WWW::Mechanize->new( autocheck => 1 );
|
||||
$mech->get( 'http://my.site.com' );
|
||||
my $response = $mech->response();
|
||||
for my $key ( $response->header_field_names() ) {
|
||||
print $key, " : ", $response->header( $key ), "\n";
|
||||
}
|
||||
|
||||
=head2 How do I enable keep-alive?
|
||||
|
||||
Since L<WWW::Mechanize> is a subclass of L<LWP::UserAgent>, you can
|
||||
use the same mechanism to enable keep-alive:
|
||||
|
||||
use LWP::ConnCache;
|
||||
...
|
||||
$mech->conn_cache(LWP::ConnCache->new);
|
||||
|
||||
=head2 How can I change/specify the action parameter of an HTML form?
|
||||
|
||||
You can access the action of the form by utilizing the L<HTML::Form>
|
||||
object returned from one of the specifying form methods.
|
||||
|
||||
Using C<< $mech->form_number($number) >>:
|
||||
|
||||
my $mech = WWW::mechanize->new;
|
||||
$mech->get('http://someurlhere.com');
|
||||
# Access the form using its Zero-Based Index by DOM order
|
||||
$mech->form_number(0)->action('http://newAction'); #ABS URL
|
||||
|
||||
Using C<< $mech->form_name($number) >>:
|
||||
|
||||
my $mech = WWW::mechanize->new;
|
||||
$mech->get('http://someurlhere.com');
|
||||
#Access the form using its Zero-Based Index by DOM order
|
||||
$mech->form_name('trgForm')->action('http://newAction'); #ABS URL
|
||||
|
||||
=head2 How do I save an image? How do I save a large tarball?
|
||||
|
||||
An image is just content. You get the image and save it.
|
||||
|
||||
$mech->get( 'photo.jpg' );
|
||||
$mech->save_content( '/path/to/my/directory/photo.jpg' );
|
||||
|
||||
You can also save any content directly to disk using the C<:content_file>
|
||||
flag to C<get()>, which is part of L<LWP::UserAgent>.
|
||||
|
||||
$mech->get( 'http://www.cpan.org/src/stable.tar.gz',
|
||||
':content_file' => 'stable.tar.gz' );
|
||||
|
||||
=head2 How do I pick a specific value from a C<< <select> >> list?
|
||||
|
||||
Find the C<HTML::Form::ListInput> in the page.
|
||||
|
||||
my ($listbox) = $mech->find_all_inputs( name => 'listbox' );
|
||||
|
||||
Then create a hash for the lookup:
|
||||
|
||||
my %name_lookup;
|
||||
@name_lookup{ $listbox->value_names } = $listbox->possible_values;
|
||||
my $value = $name_lookup{ 'Name I want' };
|
||||
|
||||
If you have duplicate names, this method won't work, and you'll
|
||||
have to loop over C<< $listbox->value_names >> and
|
||||
C<< $listbox->possible_values >> in parallel until you find a
|
||||
matching name.
|
||||
|
||||
=head2 How do I get Mech to not follow redirects?
|
||||
|
||||
You use functionality in LWP::UserAgent, not Mech itself.
|
||||
|
||||
$mech->requests_redirectable( [] );
|
||||
|
||||
Or you can set C<max_redirect>:
|
||||
|
||||
$mech->max_redirect( 0 );
|
||||
|
||||
Both these options can also be set in the constructor. Mech doesn't
|
||||
understand them, so will pass them through to the LWP::UserAgent
|
||||
constructor.
|
||||
|
||||
=head1 Why doesn't this work: Debugging your Mechanize program
|
||||
|
||||
=head2 My Mech program doesn't work, but it works in the browser.
|
||||
|
||||
Mechanize acts like a browser, but apparently something you're doing
|
||||
is not matching the browser's behavior. Maybe it's expecting a
|
||||
certain web client, or maybe you're not handling a field properly.
|
||||
For some reason, your Mech program isn't doing exactly what the
|
||||
browser is doing, and when you find that, you'll have the answer.
|
||||
|
||||
=head2 My Mech program gets these 500 errors.
|
||||
|
||||
A 500 error from the web server says that the program on the server
|
||||
side died. Probably the web server program was expecting certain
|
||||
inputs that you didn't supply, and instead of handling it nicely,
|
||||
the program died.
|
||||
|
||||
Whatever the cause of the 500 error, if it works in the browser,
|
||||
but not in your Mech program, you're not acting like the browser.
|
||||
See the previous question.
|
||||
|
||||
=head2 Why doesn't my program handle this form correctly?
|
||||
|
||||
Run F<mech-dump> on your page and see what it says.
|
||||
|
||||
F<mech-dump> is a marvelous diagnostic tool for figuring out what forms
|
||||
and fields are on the page. Say you're scraping CNN.com, you'd get this:
|
||||
|
||||
$ mech-dump http://www.cnn.com/
|
||||
GET http://search.cnn.com/cnn/search
|
||||
source=cnn (hidden readonly)
|
||||
invocationType=search/top (hidden readonly)
|
||||
sites=web (radio) [*web/The Web ??|cnn/CNN.com ??]
|
||||
query= (text)
|
||||
<NONAME>=Search (submit)
|
||||
|
||||
POST http://cgi.money.cnn.com/servlets/quote_redirect
|
||||
query= (text)
|
||||
<NONAME>=GET (submit)
|
||||
|
||||
POST http://polls.cnn.com/poll
|
||||
poll_id=2112 (hidden readonly)
|
||||
question_1=<UNDEF> (radio) [1/Simplistic option|2/VIEW RESULTS]
|
||||
<NONAME>=VOTE (submit)
|
||||
|
||||
GET http://search.cnn.com/cnn/search
|
||||
source=cnn (hidden readonly)
|
||||
invocationType=search/bottom (hidden readonly)
|
||||
sites=web (radio) [*web/??CNN.com|cnn/??]
|
||||
query= (text)
|
||||
<NONAME>=Search (submit)
|
||||
|
||||
Four forms, including the first one duplicated at the end. All the
|
||||
fields, all their defaults, lovingly generated by HTML::Form's C<dump>
|
||||
method.
|
||||
|
||||
If you want to run F<mech-dump> on something that doesn't lend itself
|
||||
to a quick URL fetch, then use the C<save_content()> method to write
|
||||
the HTML to a file, and run F<mech-dump> on the file.
|
||||
|
||||
=head2 Why don't https:// URLs work?
|
||||
|
||||
You need either L<IO::Socket::SSL> or L<Crypt::SSLeay> installed.
|
||||
|
||||
=head2 Why do I get "Input 'fieldname' is readonly"?
|
||||
|
||||
You're trying to change the value of a hidden field and you have
|
||||
warnings on.
|
||||
|
||||
First, make sure that you actually mean to change the field that you're
|
||||
changing, and that you don't have a typo. Usually, hidden variables are
|
||||
set by the site you're working on for a reason. If you change the value,
|
||||
you might be breaking some functionality by faking it out.
|
||||
|
||||
If you really do want to change a hidden value, make the changes in a
|
||||
scope that has warnings turned off:
|
||||
|
||||
{
|
||||
local $^W = 0;
|
||||
$agent->field( name => $value );
|
||||
}
|
||||
|
||||
=head2 I tried to [such-and-such] and I got this weird error.
|
||||
|
||||
Are you checking your errors?
|
||||
|
||||
Are you sure?
|
||||
|
||||
Are you checking that your action succeeded after every action?
|
||||
|
||||
Are you sure?
|
||||
|
||||
For example, if you try this:
|
||||
|
||||
$mech->get( "http://my.site.com" );
|
||||
$mech->follow_link( "foo" );
|
||||
|
||||
and the C<get> call fails for some reason, then the Mech internals
|
||||
will be unusable for the C<follow_link> and you'll get a weird
|
||||
error. You B<must>, after every action that GETs or POSTs a page,
|
||||
check that Mech succeeded, or all bets are off.
|
||||
|
||||
$mech->get( "http://my.site.com" );
|
||||
die "Can't even get the home page: ", $mech->response->status_line
|
||||
unless $mech->success;
|
||||
|
||||
$mech->follow_link( "foo" );
|
||||
die "Foo link failed: ", $mech->response->status_line
|
||||
unless $mech->success;
|
||||
|
||||
=head2 How do I figure out why C<< $mech->get($url) >> doesn't work?
|
||||
|
||||
There are many reasons why a C<< get() >> can fail. The server can take
|
||||
you to someplace you didn't expect. It can generate redirects which are
|
||||
not properly handled. You can get time-outs. Servers are down more often
|
||||
than you think! etc, etc, etc. A couple of places to start:
|
||||
|
||||
=over 4
|
||||
|
||||
=item 1 Check C<< $mech->status() >> after each call
|
||||
|
||||
=item 2 Check the URL with C<< $mech->uri() >> to see where you ended up
|
||||
|
||||
=item 3 Try debugging with C<< LWP::ConsoleLogger >>.
|
||||
|
||||
=back
|
||||
|
||||
If things are really strange, turn on debugging with
|
||||
C<< use LWP::ConsoleLogger::Everywhere; >>
|
||||
Just put this in the main program. This causes LWP to print out a trace
|
||||
of the HTTP traffic between client and server and can be used to figure
|
||||
out what is happening at the protocol level.
|
||||
|
||||
It is also useful to set many traps to verify that processing is
|
||||
proceeding as expected. A Mech program should always have an "I didn't
|
||||
expect to get here" or "I don't recognize the page that I am processing"
|
||||
case and bail out.
|
||||
|
||||
Since errors can be transient, by the time you notice that the error
|
||||
has occurred, it might not be possible to reproduce it manually. So
|
||||
for automated processing it is useful to email yourself the following
|
||||
information:
|
||||
|
||||
=over 4
|
||||
|
||||
=item * where processing is taking place
|
||||
|
||||
=item * An Error Message
|
||||
|
||||
=item * $mech->uri
|
||||
|
||||
=item * $mech->content
|
||||
|
||||
=back
|
||||
|
||||
You can also save the content of the page with C<< $mech->save_content( 'filename.html' ); >>
|
||||
|
||||
=head2 I submitted a form, but the server ignored everything! I got an empty form back!
|
||||
|
||||
The post is handled by application software. It is common for PHP
|
||||
programmers to use the same file both to display a form and to process
|
||||
the arguments returned. So the first task of the application programmer
|
||||
is to decide whether there are arguments to process. The program can
|
||||
check whether a particular parameter has been set, whether a hidden
|
||||
parameter has been set, or whether the submit button has been clicked.
|
||||
(There are probably other ways that I haven't thought of).
|
||||
|
||||
In any case, if your form is not setting the parameter (e.g. the submit
|
||||
button) which the web application is keying on (and as an outsider there
|
||||
is no way to know what it is keying on), it will not notice that the form
|
||||
has been submitted. Try using C<< $mech->click() >> instead of
|
||||
C<< $mech->submit() >> or vice-versa.
|
||||
|
||||
=head2 I've logged in to the server, but I get 500 errors when I try to get to protected content.
|
||||
|
||||
Some web sites use distributed databases for their processing. It
|
||||
can take a few seconds for the login/session information to percolate
|
||||
through to all the servers. For human users with their slow reaction
|
||||
times, this is not a problem, but a Perl script can outrun the server.
|
||||
So try adding a C<sleep(5)> between logging in and actually doing anything
|
||||
(the optimal delay must be determined experimentally).
|
||||
|
||||
=head2 Mech is a big memory pig! I'm running out of RAM!
|
||||
|
||||
Mech keeps a history of every page, and the state it was in. It actually
|
||||
keeps a clone of the full Mech object at every step along the way.
|
||||
|
||||
You can limit this stack size with the C<stack_depth> param in the C<new()>
|
||||
constructor. If you set C<stack_depth> to 0, Mech will not keep any history.
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Andy Lester <andy at petdance.com>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 2004 by Andy Lester.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=cut
|
||||
154
database/perl/vendor/lib/WWW/Mechanize/Image.pm
vendored
Normal file
154
database/perl/vendor/lib/WWW/Mechanize/Image.pm
vendored
Normal file
@@ -0,0 +1,154 @@
|
||||
package WWW::Mechanize::Image;

use strict;
use warnings;

our $VERSION = '2.03';

#ABSTRACT: Image object for WWW::Mechanize


# Construct a WWW::Mechanize::Image from a hashref of attributes.
# Only keys the caller actually supplied are stored (existence, not
# definedness, is the criterion); url and tag are mandatory.
sub new {
    my ( $class, $params ) = @_;
    $params ||= {};

    my $self = bless {}, $class;

    # Copy over exactly the parameters that were passed in.
    for my $key ( grep { exists $params->{$_} }
        qw( url base tag height width alt name attrs ) )
    {
        $self->{$key} = $params->{$key};
    }

    # url and tag are always required
    for my $required (qw( url tag )) {
        die "WWW::Mechanize::Image->new must have a $required argument"
            unless exists $self->{$required};
    }

    return $self;
}
|
||||
|
||||
|
||||
# Simple read-only accessors for the attributes captured at
# construction time.  Each returns the stored value (possibly undef).
sub url    { my $self = shift; return $self->{url}; }
sub base   { my $self = shift; return $self->{base}; }
sub name   { my $self = shift; return $self->{name}; }
sub tag    { my $self = shift; return $self->{tag}; }
sub height { my $self = shift; return $self->{height}; }
sub width  { my $self = shift; return $self->{width}; }
sub alt    { my $self = shift; return $self->{alt}; }
sub attrs  { my $self = shift; return $self->{attrs}; }
|
||||
|
||||
|
||||
# Return the image URL as a URI::URL object, carrying base() so the
# caller can resolve it to an absolute URL.
sub URI {
    my $self = shift;

    require URI::URL;
    return URI::URL->new( $self->url, $self->base );
}
|
||||
|
||||
|
||||
# Absolute form of the image URL (resolved against base()).
sub url_abs {
    my $self = shift;
    return $self->URI->abs;
}


1;
|
||||
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=encoding UTF-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Mechanize::Image - Image object for WWW::Mechanize
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
version 2.03
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Image object to encapsulate all the stuff that Mech needs
|
||||
|
||||
=head1 Constructor
|
||||
|
||||
=head2 new()
|
||||
|
||||
Creates and returns a new C<WWW::Mechanize::Image> object.
|
||||
|
||||
my $image = WWW::Mechanize::Image->new( {
|
||||
url => $url,
|
||||
base => $base,
|
||||
tag => $tag,
|
||||
name => $name, # From the INPUT tag
|
||||
height => $height, # optional
|
||||
width => $width, # optional
|
||||
alt => $alt, # optional
|
||||
attrs => $attr_ref, # optional
|
||||
} );
|
||||
|
||||
=head1 Accessors
|
||||
|
||||
=head2 $image->url()
|
||||
|
||||
Image URL from the C<src> attribute of the source tag.
|
||||
|
||||
May be C<undef> if source tag has no C<src> attribute.
|
||||
|
||||
=head2 $image->base()
|
||||
|
||||
Base URL to which the links are relative.
|
||||
|
||||
=head2 $image->name()
|
||||
|
||||
Name for the field from the NAME attribute, if any.
|
||||
|
||||
=head2 $image->tag()
|
||||
|
||||
Tag name (either "image" or "input")
|
||||
|
||||
=head2 $image->height()
|
||||
|
||||
Image height
|
||||
|
||||
=head2 $image->width()
|
||||
|
||||
Image width
|
||||
|
||||
=head2 $image->alt()
|
||||
|
||||
ALT attribute from the source tag, if any.
|
||||
|
||||
=head2 $image->attrs()
|
||||
|
||||
Hash ref of all the attributes and attribute values in the tag.
|
||||
|
||||
=head2 $image->URI()
|
||||
|
||||
Returns the URL as a L<URI::URL> object.
|
||||
|
||||
=head2 $image->url_abs()
|
||||
|
||||
Returns the URL as an absolute URL string.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Mechanize> and L<WWW::Mechanize::Link>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Andy Lester <andy at petdance.com>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 2004 by Andy Lester.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=cut
|
||||
143
database/perl/vendor/lib/WWW/Mechanize/Link.pm
vendored
Normal file
143
database/perl/vendor/lib/WWW/Mechanize/Link.pm
vendored
Normal file
@@ -0,0 +1,143 @@
|
||||
package WWW::Mechanize::Link;

use strict;
use warnings;

our $VERSION = '2.03';

#ABSTRACT: Link object for WWW::Mechanize


# Construct a link object.  Accepts either a hashref of named arguments
# or (for backwards compatibility) a flat list:
#     ( $url, $text, $name, $tag, $base, $attrs )
# Internally the object is a blessed arrayref in that fixed order.
sub new {
    my $class = shift;

    # The order of the first four must stay as they are for
    # compatibility with older code.
    my @fields
        = ( ref $_[0] eq 'HASH' )
        ? @{ $_[0] }{qw( url text name tag base attrs )}
        : @_;

    return bless [@fields], $class;
}
|
||||
|
||||
|
||||
# Positional read-only accessors; the object is an arrayref laid out as
# [ url, text, name, tag, base, attrs ].
sub url   { my $self = shift; return $self->[0]; }
sub text  { my $self = shift; return $self->[1]; }
sub name  { my $self = shift; return $self->[2]; }
sub tag   { my $self = shift; return $self->[3]; }
sub base  { my $self = shift; return $self->[4]; }
sub attrs { my $self = shift; return $self->[5]; }
|
||||
|
||||
|
||||
# Return the link URL as a URI::URL object, carrying base() so the
# caller can resolve it to an absolute URL.
sub URI {
    my $self = shift;

    require URI::URL;
    return URI::URL->new( $self->url, $self->base );
}
|
||||
|
||||
|
||||
# Absolute form of the link URL (resolved against base()).
sub url_abs {
    my $self = shift;
    return $self->URI->abs;
}


1;
|
||||
|
||||
__END__
|
||||
|
||||
=pod
|
||||
|
||||
=encoding UTF-8
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::Mechanize::Link - Link object for WWW::Mechanize
|
||||
|
||||
=head1 VERSION
|
||||
|
||||
version 2.03
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
Link object to encapsulate all the stuff that Mech needs but nobody
|
||||
wants to deal with as an array.
|
||||
|
||||
=head1 Constructor
|
||||
|
||||
=head2 new()
|
||||
|
||||
my $link = WWW::Mechanize::Link->new( {
|
||||
url => $url,
|
||||
text => $text,
|
||||
name => $name,
|
||||
tag => $tag,
|
||||
base => $base,
|
||||
attrs => $attr_href,
|
||||
} );
|
||||
|
||||
For compatibility, this older interface is also supported:
|
||||
|
||||
new( $url, $text, $name, $tag, $base, $attr_href )
|
||||
|
||||
Creates and returns a new C<WWW::Mechanize::Link> object.
|
||||
|
||||
=head1 Accessors
|
||||
|
||||
=head2 $link->url()
|
||||
|
||||
URL from the link
|
||||
|
||||
=head2 $link->text()
|
||||
|
||||
Text of the link
|
||||
|
||||
=head2 $link->name()
|
||||
|
||||
NAME attribute from the source tag, if any.
|
||||
|
||||
=head2 $link->tag()
|
||||
|
||||
Tag name (one of: "a", "area", "frame", "iframe" or "meta").
|
||||
|
||||
=head2 $link->base()
|
||||
|
||||
Base URL to which the links are relative.
|
||||
|
||||
=head2 $link->attrs()
|
||||
|
||||
Returns hash ref of all the attributes and attribute values in the tag.
|
||||
|
||||
=head2 $link->URI()
|
||||
|
||||
Returns the URL as a L<URI::URL> object.
|
||||
|
||||
=head2 $link->url_abs()
|
||||
|
||||
Returns a L<URI::URL> object for the absolute form of the string.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::Mechanize> and L<WWW::Mechanize::Image>
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
Andy Lester <andy at petdance.com>
|
||||
|
||||
=head1 COPYRIGHT AND LICENSE
|
||||
|
||||
This software is copyright (c) 2004 by Andy Lester.
|
||||
|
||||
This is free software; you can redistribute it and/or modify it under
|
||||
the same terms as the Perl 5 programming language system itself.
|
||||
|
||||
=cut
|
||||
453
database/perl/vendor/lib/WWW/RobotRules.pm
vendored
Normal file
453
database/perl/vendor/lib/WWW/RobotRules.pm
vendored
Normal file
@@ -0,0 +1,453 @@
|
||||
package WWW::RobotRules;

# Declare package globals with "our" instead of relying on the bare
# assignment happening before "use strict" takes effect.
our $VERSION = "6.02";
sub Version { $VERSION; }

use strict;
use URI ();



# Constructor.  $ua is the robot's User-Agent name (stored via the
# subclass's agent() method, which normalizes it to a short name).
#
# WWW::RobotRules is really an abstract base class; constructing it
# directly hands back the in-memory implementation.
sub new {
    my ( $class, $ua ) = @_;

    # This ugly hack is needed to ensure backwards compatibility.
    # The "WWW::RobotRules" class is now really abstract.
    $class = "WWW::RobotRules::InCore" if $class eq "WWW::RobotRules";

    my $self = bless {}, $class;
    $self->agent($ua);
    return $self;
}
|
||||
|
||||
|
||||
# Parse the contents of a robots.txt file.
#
# Arguments:
#   $robot_txt_uri - URL the robots.txt was fetched from; used both to
#                    key the cache (host:port) and to resolve/validate
#                    Disallow values.
#   $txt           - raw robots.txt text.
#   $fresh_until   - expiry time for this parse; defaults to one year.
#
# Replaces any previously stored rules for the host.  Rules from the
# record matching our agent name win over the "*" record.
sub parse {
    my($self, $robot_txt_uri, $txt, $fresh_until) = @_;
    $robot_txt_uri = URI->new("$robot_txt_uri");
    my $netloc = $robot_txt_uri->host . ":" . $robot_txt_uri->port;

    $self->clear_rules($netloc);
    $self->fresh_until($netloc, $fresh_until || (time + 365*24*3600));

    my $ua;
    my $is_me = 0;           # 1 iff this record is for me
    my $is_anon = 0;         # 1 iff this record is for *
    my $seen_disallow = 0;   # watch for missing record separators
    my @me_disallowed = ();  # rules disallowed for me
    my @anon_disallowed = ();# rules disallowed for *

    # blank lines are significant, so turn CRLF into LF to avoid generating
    # false ones
    $txt =~ s/\015\012/\012/g;

    # split at \012 (LF) or \015 (CR) (Mac text files have just CR for EOL)
    for(split(/[\012\015]/, $txt)) {

        # Lines containing only a comment are discarded completely, and
        # therefore do not indicate a record boundary.
        next if /^\s*\#/;

        s/\s*\#.*//;        # remove comments at end-of-line

        if (/^\s*$/) {      # blank line ends a record
            last if $is_me; # That was our record. No need to read the rest.
            $is_anon = 0;
            $seen_disallow = 0;
        }
        elsif (/^\s*User-Agent\s*:\s*(.*)/i) {
            $ua = $1;
            $ua =~ s/\s+$//;

            # A User-Agent after a Disallow means the record separator
            # (blank line) was omitted: treat it as a new record.
            if ($seen_disallow) {
                # treat as start of a new record
                $seen_disallow = 0;
                last if $is_me; # That was our record. No need to read the rest.
                $is_anon = 0;
            }

            if ($is_me) {
                # This record already had a User-agent that
                # we matched, so just continue.
            }
            elsif ($ua eq '*') {
                $is_anon = 1;
            }
            elsif($self->is_me($ua)) {
                $is_me = 1;
            }
        }
        elsif (/^\s*Disallow\s*:\s*(.*)/i) {
            unless (defined $ua) {
                warn "RobotRules <$robot_txt_uri>: Disallow without preceding User-agent\n" if $^W;
                $is_anon = 1;  # assume that User-agent: * was intended
            }
            my $disallow = $1;
            $disallow =~ s/\s+$//;
            $seen_disallow = 1;
            if (length $disallow) {
                # A non-empty value may be a full URL; reduce it to a
                # path and ignore it if it points at a different
                # scheme/host/port than the robots.txt itself.
                my $ignore;
                eval {
                    my $u = URI->new_abs($disallow, $robot_txt_uri);
                    $ignore++ if $u->scheme ne $robot_txt_uri->scheme;
                    $ignore++ if lc($u->host) ne lc($robot_txt_uri->host);
                    $ignore++ if $u->port ne $robot_txt_uri->port;
                    $disallow = $u->path_query;
                    $disallow = "/" unless length $disallow;
                };
                next if $@;
                next if $ignore;
            }

            if ($is_me) {
                push(@me_disallowed, $disallow);
            }
            elsif ($is_anon) {
                push(@anon_disallowed, $disallow);
            }
        }
        elsif (/\S\s*:/) {
             # ignore unrecognized "field: value" lines
        }
        else {
            warn "RobotRules <$robot_txt_uri>: Malformed record: <$_>\n" if $^W;
        }
    }

    # Prefer the rules addressed to us; otherwise fall back to "*".
    if ($is_me) {
        $self->push_rules($netloc, @me_disallowed);
    }
    else {
        $self->push_rules($netloc, @anon_disallowed);
    }
}
|
||||
|
||||
|
||||
#
# Returns TRUE if the given name matches the
# name of this robot
#
# NOTE(review): the index() arguments look reversed relative to the
# comment below -- index(lc($me), lc($ua_line)) tests whether the
# User-Agent line from robots.txt is a substring of OUR agent name,
# not the other way around.  This matches the code as shipped in
# WWW::RobotRules 6.x, so it is documented rather than changed;
# confirm against upstream before "fixing".
#
sub is_me {
    my($self, $ua_line) = @_;
    my $me = $self->agent;

    # See whether my short-name is a substring of the
    # "User-Agent: ..." line that we were passed:

    if(index(lc($me), lc($ua_line)) >= 0) {
        return 1;
    }
    else {
        return '';
    }
}
|
||||
|
||||
|
||||
# Is this robot allowed to fetch $uri?
#
# Returns 1 if allowed, 0 if a Disallow rule matches, and -1 if the
# cached robots.txt for the host is missing or stale (the caller is
# expected to fetch and parse() a fresh copy in that case).
sub allowed {
    my($self, $uri) = @_;
    $uri = URI->new("$uri");

    return 1 unless $uri->scheme eq 'http' or $uri->scheme eq 'https';
    # Robots.txt applies to only those schemes.

    my $netloc = $uri->host . ":" . $uri->port;

    my $fresh_until = $self->fresh_until($netloc);
    return -1 if !defined($fresh_until) || $fresh_until < time;

    # An empty rule means "allow everything"; otherwise each rule is a
    # path prefix that blocks any URL starting with it.
    my $str = $uri->path_query;
    my $rule;
    for $rule ($self->rules($netloc)) {
        return 1 unless length $rule;
        return 0 if index($str, $rule) == 0;
    }
    return 1;
}
|
||||
|
||||
|
||||
# The following methods must be provided by the subclass.
# These are forward declarations only; see WWW::RobotRules::InCore
# below and WWW::RobotRules::AnyDBM_File for implementations.
#
# NOTE(review): "last_visits" is declared here but the known
# implementations define "last_visit" (singular) -- looks like a
# latent naming inconsistency inherited from upstream; confirm before
# relying on either name.
sub agent;
sub visit;
sub no_visits;
sub last_visits;
sub fresh_until;
sub push_rules;
sub clear_rules;
sub rules;
sub dump;
|
||||
|
||||
|
||||
|
||||
package WWW::RobotRules::InCore;

# Default in-memory implementation of the WWW::RobotRules interface.
# "use vars" is obsolete; "our" declares the package global under the
# strictures already in effect earlier in this file.
our @ISA = qw(WWW::RobotRules);
|
||||
|
||||
|
||||
|
||||
# Get/set the robot's agent name.  A full User-Agent string such as
# "FooBot/1.2 [http://foobot.int; foo@bot.int]" is reduced to just its
# short name "FooBot".  Setting a *different* name invalidates all
# cached per-host information.  Returns the previous name.
sub agent {
    my ( $self, $name ) = @_;

    my $previous = $self->{'ua'};

    if ($name) {
        # Keep only the first whitespace-delimited word ...
        if ( $name =~ m/(\S+)/ ) {
            $name = $1;
        }
        # ... and strip any "/version" suffix.
        $name =~ s{/.*}{};

        unless ( $previous && $previous eq $name ) {
            delete $self->{'loc'};    # all old info is now stale
            $self->{'ua'} = $name;
        }
    }

    return $previous;
}
|
||||
|
||||
|
||||
# Record a visit to $netloc at $time (defaults to now): updates the
# last-visit timestamp and increments the visit counter.  A false
# $netloc is silently ignored.
sub visit {
    my ( $self, $netloc, $time ) = @_;

    return unless $netloc;

    $time ||= time;

    my $entry = $self->{'loc'}{$netloc} ||= {};
    $entry->{'last'}  = $time;
    $entry->{'count'} = ( $entry->{'count'} || 0 ) + 1;
}
|
||||
|
||||
|
||||
# Number of recorded visits to $netloc (undef if never visited).
sub no_visits {
    my ( $self, $netloc ) = @_;
    return $self->{'loc'}{$netloc}{'count'};
}
|
||||
|
||||
|
||||
# Epoch time of the most recent visit to $netloc (undef if none).
sub last_visit {
    my ( $self, $netloc ) = @_;
    return $self->{'loc'}{$netloc}{'last'};
}
|
||||
|
||||
|
||||
# Get/set the expiry time of the cached robots.txt for $netloc.
# Returns the previous expiry time (undef if none was stored).
sub fresh_until {
    my ( $self, $netloc, $fresh_until ) = @_;

    my $previous = $self->{'loc'}{$netloc}{'fresh'};

    $self->{'loc'}{$netloc}{'fresh'} = $fresh_until
        if defined $fresh_until;

    return $previous;
}
|
||||
|
||||
|
||||
# Append Disallow rules (path prefixes) for $netloc.
sub push_rules {
    my ( $self, $netloc, @rules ) = @_;
    push @{ $self->{'loc'}{$netloc}{'rules'} }, @rules;
}
|
||||
|
||||
|
||||
# Forget all stored rules for $netloc (visit statistics are kept).
sub clear_rules {
    my ( $self, $netloc ) = @_;
    delete $self->{'loc'}{$netloc}{'rules'};
}
|
||||
|
||||
|
||||
# Return the list of stored rules for $netloc (empty list if none).
sub rules {
    my ( $self, $netloc ) = @_;

    my $stored = $self->{'loc'}{$netloc}{'rules'};
    return defined $stored ? @{$stored} : ();
}
|
||||
|
||||
|
||||
# Debugging aid: print the object's top-level scalar fields to STDOUT,
# then one line per known netloc listing its rules joined with "; ".
sub dump
{
    my $self = shift;
    # Everything except 'loc' is simple metadata (e.g. 'ua').
    for (keys %$self) {
        next if $_ eq 'loc';
        print "$_ = $self->{$_}\n";
    }
    for (keys %{$self->{'loc'}}) {
        my @rules = $self->rules($_);
        print "$_: ", join("; ", @rules), "\n";
    }
}


1;
|
||||
|
||||
__END__
|
||||
|
||||
|
||||
# Bender: "Well, I don't have anything else
|
||||
# planned for today. Let's get drunk!"
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::RobotRules - database of robots.txt-derived permissions
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
use WWW::RobotRules;
|
||||
my $rules = WWW::RobotRules->new('MOMspider/1.0');
|
||||
|
||||
use LWP::Simple qw(get);
|
||||
|
||||
{
|
||||
my $url = "http://some.place/robots.txt";
|
||||
my $robots_txt = get $url;
|
||||
$rules->parse($url, $robots_txt) if defined $robots_txt;
|
||||
}
|
||||
|
||||
{
|
||||
my $url = "http://some.other.place/robots.txt";
|
||||
my $robots_txt = get $url;
|
||||
$rules->parse($url, $robots_txt) if defined $robots_txt;
|
||||
}
|
||||
|
||||
# Now we can check if a URL is valid for those servers
|
||||
# whose "robots.txt" files we've gotten and parsed:
|
||||
if($rules->allowed($url)) {
|
||||
$c = get $url;
|
||||
...
|
||||
}
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This module parses F</robots.txt> files as specified in
|
||||
"A Standard for Robot Exclusion", at
|
||||
<http://www.robotstxt.org/wc/norobots.html>
|
||||
Webmasters can use the F</robots.txt> file to forbid conforming
|
||||
robots from accessing parts of their web site.
|
||||
|
||||
The parsed files are kept in a WWW::RobotRules object, and this object
|
||||
provides methods to check if access to a given URL is prohibited. The
|
||||
same WWW::RobotRules object can be used for one or more parsed
|
||||
F</robots.txt> files on any number of hosts.
|
||||
|
||||
The following methods are provided:
|
||||
|
||||
=over 4
|
||||
|
||||
=item $rules = WWW::RobotRules->new($robot_name)
|
||||
|
||||
This is the constructor for WWW::RobotRules objects. The first
|
||||
argument given to new() is the name of the robot.
|
||||
|
||||
=item $rules->parse($robot_txt_url, $content, $fresh_until)
|
||||
|
||||
The parse() method takes as arguments the URL that was used to
|
||||
retrieve the F</robots.txt> file, and the contents of the file.
|
||||
|
||||
=item $rules->allowed($uri)
|
||||
|
||||
Returns TRUE if this robot is allowed to retrieve this URL.
|
||||
|
||||
=item $rules->agent([$name])
|
||||
|
||||
Get/set the agent name. NOTE: Changing the agent name will clear the robots.txt
|
||||
rules and expire times out of the cache.
|
||||
|
||||
=back
|
||||
|
||||
=head1 ROBOTS.TXT
|
||||
|
||||
The format and semantics of the "/robots.txt" file are as follows
|
||||
(this is an edited abstract of
|
||||
<http://www.robotstxt.org/wc/norobots.html>):
|
||||
|
||||
The file consists of one or more records separated by one or more
|
||||
blank lines. Each record contains lines of the form
|
||||
|
||||
<field-name>: <value>
|
||||
|
||||
The field name is case insensitive. Text after the '#' character on a
|
||||
line is ignored during parsing. This is used for comments. The
|
||||
following <field-names> can be used:
|
||||
|
||||
=over 3
|
||||
|
||||
=item User-Agent
|
||||
|
||||
The value of this field is the name of the robot the record is
|
||||
describing access policy for. If more than one I<User-Agent> field is
|
||||
present the record describes an identical access policy for more than
|
||||
one robot. At least one field needs to be present per record. If the
|
||||
value is '*', the record describes the default access policy for any
|
||||
robot that has not matched any of the other records.
|
||||
|
||||
The I<User-Agent> fields must occur before the I<Disallow> fields. If a
|
||||
record contains a I<User-Agent> field after a I<Disallow> field, that
|
||||
constitutes a malformed record. This parser will assume that a blank
|
||||
line should have been placed before that I<User-Agent> field, and will
|
||||
break the record into two. All the fields before the I<User-Agent> field
|
||||
will constitute a record, and the I<User-Agent> field will be the first
|
||||
field in a new record.
|
||||
|
||||
=item Disallow
|
||||
|
||||
The value of this field specifies a partial URL that is not to be
|
||||
visited. This can be a full path, or a partial path; any URL that
|
||||
starts with this value will not be retrieved
|
||||
|
||||
=back
|
||||
|
||||
Unrecognized records are ignored.
|
||||
|
||||
=head1 ROBOTS.TXT EXAMPLES
|
||||
|
||||
The following example "/robots.txt" file specifies that no robots
|
||||
should visit any URL starting with "/cyberworld/map/" or "/tmp/":
|
||||
|
||||
User-agent: *
|
||||
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
|
||||
Disallow: /tmp/ # these will soon disappear
|
||||
|
||||
This example "/robots.txt" file specifies that no robots should visit
|
||||
any URL starting with "/cyberworld/map/", except the robot called
|
||||
"cybermapper":
|
||||
|
||||
User-agent: *
|
||||
Disallow: /cyberworld/map/ # This is an infinite virtual URL space
|
||||
|
||||
# Cybermapper knows where to go.
|
||||
User-agent: cybermapper
|
||||
Disallow:
|
||||
|
||||
This example indicates that no robots should visit this site further:
|
||||
|
||||
# go away
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
|
||||
This is an example of a malformed robots.txt file.
|
||||
|
||||
# robots.txt for ancientcastle.example.com
|
||||
# I've locked myself away.
|
||||
User-agent: *
|
||||
Disallow: /
|
||||
# The castle is your home now, so you can go anywhere you like.
|
||||
User-agent: Belle
|
||||
Disallow: /west-wing/ # except the west wing!
|
||||
# It's good to be the Prince...
|
||||
User-agent: Beast
|
||||
Disallow:
|
||||
|
||||
This file is missing the required blank lines between records.
|
||||
However, the intention is clear.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<LWP::RobotUA>, L<WWW::RobotRules::AnyDBM_File>
|
||||
|
||||
=head1 COPYRIGHT
|
||||
|
||||
Copyright 1995-2009, Gisle Aas
|
||||
Copyright 1995, Martijn Koster
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the same terms as Perl itself.
|
||||
170
database/perl/vendor/lib/WWW/RobotRules/AnyDBM_File.pm
vendored
Normal file
170
database/perl/vendor/lib/WWW/RobotRules/AnyDBM_File.pm
vendored
Normal file
@@ -0,0 +1,170 @@
|
||||
package WWW::RobotRules::AnyDBM_File;

require WWW::RobotRules;

# Declare package globals with "our" rather than bare assignments that
# only compile because they precede "use strict" below.
our @ISA     = qw(WWW::RobotRules);
our $VERSION = "6.00";

use Carp ();
use AnyDBM_File;
use Fcntl;
use strict;
|
||||
|
||||
=head1 NAME
|
||||
|
||||
WWW::RobotRules::AnyDBM_File - Persistent RobotRules
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
require WWW::RobotRules::AnyDBM_File;
|
||||
require LWP::RobotUA;
|
||||
|
||||
# Create a robot useragent that uses a diskcaching RobotRules
|
||||
my $rules = WWW::RobotRules::AnyDBM_File->new( 'my-robot/1.0', 'cachefile' );
|
||||
my $ua = WWW::RobotUA->new( 'my-robot/1.0', 'me@foo.com', $rules );
|
||||
|
||||
# Then just use $ua as usual
|
||||
$res = $ua->request($req);
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
This is a subclass of I<WWW::RobotRules> that uses the AnyDBM_File
|
||||
package to implement persistent diskcaching of F<robots.txt> and host
|
||||
visit information.
|
||||
|
||||
The constructor (the new() method) takes an extra argument specifying
|
||||
the name of the DBM file to use. If the DBM file already exists, then
|
||||
you can specify undef as agent name as the name can be obtained from
|
||||
the DBM database.
|
||||
|
||||
=cut
|
||||
|
||||
# Construct a persistent RobotRules cache backed by a DBM file.
#
# Arguments: class name, robot user-agent string (may be undef when the
# DBM file already exists and records the name), and the DBM filename.
# Croaks when no filename is given, when the DBM file cannot be opened,
# or when no agent name is available from either source.
sub new
{
    my ($class, $agent_name, $db_file) = @_;

    Carp::croak('WWW::RobotRules::AnyDBM_File filename required')
        unless $db_file;

    my $self = bless {}, $class;
    $self->{'filename'} = $db_file;

    # Create the file on first use; open read-write thereafter.
    tie %{ $self->{'dbm'} }, 'AnyDBM_File', $db_file, O_CREAT | O_RDWR, 0640
        or Carp::croak("Can't open $db_file: $!");

    if ($agent_name) {
        $self->agent($agent_name);
    }
    else {
        # No name supplied: fall back to the one recorded in an
        # existing DBM file.
        $agent_name = $self->{'dbm'}{"|ua-name|"};
        Carp::croak("No agent name specified") unless $agent_name;
    }

    return $self;
}
|
||||
|
||||
# Get/set the robot name recorded in the DBM cache.
#
# Returns the previously stored name (undef if none).  When a new name
# is supplied, any trailing "/x.y" version suffix is stripped first; if
# the stripped name differs from the stored one, every cached
# robots.txt rule and visit record is stale, so the DBM file is
# truncated before the new name is recorded.
sub agent {
    my ($self, $newname) = @_;
    my $old = $self->{'dbm'}{"|ua-name|"};
    if (defined $newname) {
        # Strip a trailing version number, e.g. "FooBot/1.0" -> "FooBot".
        # Bug fix: the dot must be escaped; the previous unescaped "."
        # also stripped non-version suffixes such as "1x0".
        $newname =~ s!/?\s*\d+\.\d+\s*$!!;
        unless ($old && $old eq $newname) {
            # Different robot: the old cache entries do not apply.
            my $file = $self->{'filename'};
            untie %{$self->{'dbm'}};
            # NOTE(review): a failed re-tie is silently ignored here
            # (unlike in new()); preserved as-is to avoid changing
            # caller-visible behavior.
            tie %{$self->{'dbm'}}, 'AnyDBM_File', $file, O_TRUNC|O_RDWR, 0640;
            %{$self->{'dbm'}} = ();
            $self->{'dbm'}{"|ua-name|"} = $newname;
        }
    }
    return $old;
}
|
||||
|
||||
# Number of recorded visits to $netloc (0 if never visited).
# The visit record is stored as "count; epoch; human-readable".
sub no_visits {
    my ($self, $netloc) = @_;
    my $record = $self->{'dbm'}{"$netloc|vis"} or return 0;
    return (split /;\s*/, $record)[0];
}
|
||||
|
||||
# Epoch time of the most recent visit to $netloc, or undef if no visit
# has been recorded.  (Second field of the "count; epoch; text" record.)
sub last_visit {
    my ($self, $netloc) = @_;
    my $record = $self->{'dbm'}{"$netloc|vis"};
    return undef unless $record;
    return (split /;\s*/, $record)[1];
}
|
||||
|
||||
# Get/set the expiry time for a host's cached robots.txt.
#
# Returns the previously stored expiry as epoch seconds (the cleartext
# timestamp appended after ";" is dropped).  When $fresh is given it is
# stored together with a human-readable rendering.
sub fresh_until {
    my ($self, $netloc, $fresh) = @_;
    my $previous = $self->{'dbm'}{"$netloc|exp"};
    $previous =~ s/;.*// if $previous;    # keep only the epoch part
    if (defined $fresh) {
        $self->{'dbm'}{"$netloc|exp"} = "$fresh; " . localtime($fresh);
    }
    return $previous;
}
|
||||
|
||||
# Record a visit to $netloc at $time (defaults to now).
#
# Increments the visit count and keeps the most recent visit time,
# stored as "count; epoch; human-readable".
sub visit {
    my ($self, $netloc, $time) = @_;
    $time ||= time;

    my $visits = 0;
    if (my $record = $self->{'dbm'}{"$netloc|vis"}) {
        my ($count, $last) = split /;\s*/, $record;
        $visits = $count;
        # Never move the recorded time backwards.
        $time = $last if $last > $time;
    }
    $visits++;
    $self->{'dbm'}{"$netloc|vis"} = "$visits; $time; " . localtime($time);
}
|
||||
|
||||
# Append @rules for $netloc, continuing after any rules already stored
# under the keys "$netloc|r1", "$netloc|r2", ...
sub push_rules {
    my ($self, $netloc, @rules) = @_;

    # Find the first unused rule slot.
    my $slot = 1;
    $slot++ while $self->{'dbm'}{"$netloc|r$slot"};

    for my $rule (@rules) {
        $self->{'dbm'}{"$netloc|r$slot"} = $rule;
        $slot++;
    }
}
|
||||
|
||||
# Delete every stored rule for $netloc ("$netloc|r1", "$netloc|r2", ...).
# Rules for other hosts are untouched.
sub clear_rules {
    my ($self, $netloc) = @_;
    for (my $slot = 1; $self->{'dbm'}{"$netloc|r$slot"}; $slot++) {
        delete $self->{'dbm'}{"$netloc|r$slot"};
    }
}
|
||||
|
||||
# Return all stored rule lines for $netloc, in slot order.  Stops at
# the first missing (or false) slot, matching how push_rules stores
# them contiguously from r1.
sub rules {
    my ($self, $netloc) = @_;
    my @collected;
    my $slot = 1;
    while (my $rule = $self->{'dbm'}{"$netloc|r$slot"}) {
        push @collected, $rule;
        $slot++;
    }
    return @collected;
}
|
||||
|
||||
# NOTE(review): intentionally an empty no-op.  Presumably this overrides
# a dump() inherited from WWW::RobotRules (see @ISA above) that assumes
# an in-memory hash layout not used by this DBM-backed subclass —
# confirm against the parent class before relying on that.
sub dump
{
}
|
||||
|
||||
1;
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
L<WWW::RobotRules>, L<LWP::RobotUA>
|
||||
|
||||
=head1 AUTHORS
|
||||
|
||||
Hakan Ardo E<lt>hakan@munin.ub2.lu.se>, Gisle Aas E<lt>aas@sn.no>
|
||||
|
||||
=cut
|
||||
|
||||
Reference in New Issue
Block a user