#!/usr/bin/perl
use strict;
use warnings;
use WWW::Curl::Easy;

# Script produced by Benjamin Crudo on December 28, 2010.
# Feel free to use it and learn from it without me suing you.

# getWebPage($url)
#
# Fetches the resource at $url with libcurl (via WWW::Curl::Easy) and returns
# what was received.
#
# Parameters:
#   $url - the URL to retrieve, e.g. "http://www.google.com".
#
# Returns:
#   On a successful transfer, a string holding the response. Because
#   CURLOPT_HEADER is switched on, the response headers precede the body in
#   that string. On failure (missing URL or a non-zero curl return code),
#   returns nothing: undef in scalar context, the empty list in list context.
#
# Side effects:
#   Prints a one-line "Success ..." or "Failure ..." status to STDOUT.
#   Dies if the in-memory file handle cannot be opened or closed.
#
# Note: "success" means the curl transfer itself completed (return code 0).
# The HTTP status code is only reported in the status line, so a 404 or 500
# page is still returned as a successful fetch.
#
# Example:
#   my $web_page = getWebPage("http://www.google.com");
sub getWebPage {
    my ($url) = @_;

    # Refuse to hand libcurl an undefined or empty target.
    if (!defined $url || $url eq '') {
        print("Failure: no URL supplied\n");
        return;
    }

    my $curl = WWW::Curl::Easy->new;

    # A value of 1 tells the library to include the header in the body output.
    # This is only relevant for protocols that actually have headers
    # preceding the data (like HTTP).
    $curl->setopt(CURLOPT_HEADER, 1);

    # Target URL to retrieve.
    $curl->setopt(CURLOPT_URL, $url);

    # curl writes to a file handle; open one backed by an in-memory scalar so
    # the response ends up in $response_body.
    my $response_body = '';
    open(my $fileb, '>', \$response_body)
        or die "Cannot open in-memory file handle: $!";
    $curl->setopt(CURLOPT_WRITEDATA, $fileb);

    # perform returns the curl return code; 0 means the transfer succeeded.
    my $return_code = $curl->perform;

    # HTTP status reported by the server, i.e. 200, 404, 500, etc.
    my $response_code = $curl->getinfo(CURLINFO_HTTP_CODE);

    # Close the handle on every path, before any return.
    close($fileb) or die "Cannot close in-memory file handle: $!";

    if ($return_code == 0) {
        # A little debug output to keep you informed.
        print("Success " . $response_code . ": " . $url . "\n");
        return $response_body;
    }

    print("Failure " . $response_code . ": " . $url
        . " (" . $curl->strerror($return_code) . ")\n");
    return;
}
Tuesday, December 28, 2010
Perl script to GET a web page
Recently I found it necessary to do a little web page scraping, so I concocted a small Perl script to help me get the job done. The script provided with this post contains a single function that fetches the contents of a web page whose URL is passed as the sole parameter to the function. I would likely embed this function within another script to make it easily accessible. I've taken the time to place a lot of comments into the script itself, so this post will be short and sweet. You'll need to have the CURL library installed (the script loads it through the WWW::Curl::Easy Perl module) in order for this script to work, and please make sure you are using the most recent version (at the time of writing this post it is 3.12).
Subscribe to:
Post Comments (Atom)
Did you try Perl LWP?
Reply Delete http://search.cpan.org/~gaas/libwww-perl-6.02/lib/LWP/Simple.pm