I often use web scraping code, of which the snippet below is an example, for looking at technology as part of an IT assessment, due diligence, or review. For this post, I am assuming that the latest stable versions of PHP and cURL are installed and working. Below is some generic web scraping code that works well for most websites.
I generally grab all of the text on the page and then sort through it. Once I have worked out what I want to keep, I discard the raw data.
# $target is set to whatever web page URL I am looking to scrape. I can update it
# so that if it finds "Next" or "page 2" etc. then it will cycle around again.
$user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36';
$proxy = "127.0.0.1";   # local Tor SOCKS proxy — only used if the proxy lines below are enabled
$port = "9050";
# FIX: $timeout was previously used on the CURLOPT_CONNECTTIMEOUT line without ever
# being defined, raising an undefined-variable warning and passing null to curl.
# Default to 10 seconds unless a caller set it earlier (?? suppresses the notice).
$timeout = $timeout ?? 10;
# NOTE: tempnam() treats the second argument as a filename *prefix*; the actual
# cookie file gets a random suffix appended, so ".txt" here is cosmetic.
$ckfile = tempnam ("/home/pgroom", "targetwebpagecookie.txt");
$ch = curl_init($target);
curl_setopt($ch, CURLOPT_USERAGENT, $user_agent);
curl_setopt($ch, CURLOPT_URL, $target);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $timeout);
curl_setopt($ch, CURLOPT_FAILONERROR, TRUE);     # treat HTTP >= 400 as a curl error
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);  # follow redirects...
curl_setopt($ch, CURLOPT_MAXREDIRS, 4);          # ...but at most 4 of them
curl_setopt($ch, CURLOPT_COOKIESESSION, TRUE);   # start a fresh cookie session
curl_setopt($ch, CURLOPT_COOKIEJAR, $ckfile);    # persist cookies for any follow-up pages
curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE);  # return the body rather than printing it
curl_setopt($ch, CURLOPT_ENCODING, "");          # empty string = accept any encoding curl supports (gzip, etc.)
# curl_setopt($ch, CURLOPT_VERBOSE, TRUE);       # FIX: was $c, which would fail if uncommented
# WARNING: the next two lines disable TLS certificate verification, leaving the
# request open to man-in-the-middle interception. Acceptable for throwaway
# scraping of public pages; never reuse this for anything sensitive.
curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, FALSE);
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, FALSE);
# curl_setopt($ch, CURLOPT_PROXYTYPE, CURLPROXY_SOCKS5_HOSTNAME);  # == 7: SOCKS5 with remote DNS (needed for Tor)
# curl_setopt($ch, CURLOPT_PROXY, $proxy.':'.$port);
$initpage = curl_exec($ch);
$curl_errno = curl_errno($ch);  # capture before curl_close() invalidates the handle
$curl_error = curl_error($ch);
curl_close($ch);
if ($curl_errno > 0)
{
print "\nCurl error:".$curl_error;
}