Jump to content

Using cURL to get data off a website Help Please


savagenoob

Recommended Posts

OK, I have the initial cURL working but need to figure out how to extract data I want off that webpage to display or store in a database, I tried using dom and xpath, but because of the way the page displays using css, i think its not picking it up. Here is my cURL script:

<?php
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';

$target_url = "www.test.com";
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);
if (!$html) {
echo "<br />cURL error number:" .curl_errno($ch);
echo "<br />cURL error:" . curl_error($ch);
exit;
}
// parse the html into a DOMDocument
$dom = new DOMDocument();
$dom->loadHTML($html);

// grab all the on the page
$xpath = new DOMXPath($dom);
$hrefs = $xpath->evaluate("/html/body//td");

for ($i = 0; $i < $hrefs->length; $i++) {
$href = $hrefs->item($i);
$url = $href->getAttribute('href');
storeLink($url,$target_url);
echo "<br />Link stored: $url";
}

?>

and here is a snippet of the source of the page I am getting:

 

<span id="lblTest"><h1 id='surrZipTitle'>Agents in Surrounding Zip Codes</h1><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>4444 MANZANITA AVE STE 6</span><br /><span>CARMICHAEL                    , CA 95608-1488</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415199904150295&lat=38.646142&lng=-121.327623' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a>    <a class='faaBlueLink' id='lnkWebSite' style='display: none;' href='http://' target='_blank' onclick="return trackEvent('/External-Link/AgentWebsite/                                                                      ','PROFESSIONAL INS ASSOC, INC.                                ');">Website</a></div></td><td valign='top'> </td></tr></table></td></tr></table><table cellpadding='0' cellspacing='0' border='0' class='tblDent'><tr><td class='tdEliteTitle'><span class='caaSubHead3 addwidth'>H.K. Dent Elite</span></td></tr><tr><td class='tdEliteContent'><table cellpadding='0' cellspacing='0' border='0'><tr><td valign='top'><span class='caaAgencyName2 addwidth'>AMERICAN AIM AUTO INS AGY, INC</span></td><td valign='top'> </td></tr></table><table cellpadding='0' cellspacing='0' border='0'><tr><td width='360px' valign='top'><div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS                    , CA 95628-3318</span><br /><a class='faaBlueLink' id='lnkContact' href='http://www.safeco.com/portal/server.pt/gateway/PTARGS_0_20656_395_362_0_43/http%3B/por-portlets-prd.int.apps.safeco.com%3B13425/dotcom/FindAnAgent/find-an-agent/contactanagent.aspx?RequestType=agency&level=elite&Id=0415911704151222&lat=38.66237&lng=-121.292429' onclick='oOobj4.Preferences.Plugins.Events.poX=0;'>Contact & Directions</a>

 

So basically I want to extract the agency name like "<span class='caaAgencyName2 addwidth'>PROFESSIONAL INS ASSOC, INC.</span>" and the address which always use the same div class like "caaAgencyName2" and "addressBlock".  How can this be accomplished?

 

Link to comment
Share on other sites

I am trying:

<?php
$userAgent = 'Googlebot/2.1 (http://www.googlebot.com/bot.html)';
header('Content-type: text/xml; charset=utf-8', true);

$target_url = "test.com";
$ch = curl_init();
curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
curl_setopt($ch, CURLOPT_URL,$target_url);
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);
curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
$html = curl_exec($ch);
$html = @mb_convert_encoding($html, 'HTML-ENTITIES', 'utf-8');

curl_close( $ch );
$char  = "<span class=(\"|\'|)caaAgencyName2 addwidth(\"|\'|)>(.*?)</span>";
echo preg_match($char, $html);
?>

but its coming up blank... I think I am close

Link to comment
Share on other sites

do you want ALL matches, or will there only be 1? i assume you want all (preg_match_all), but preg_match should get just the first one.

 

$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $content, $matches);
print_r($matches);

 

edited to further simplify regular expression

Link to comment
Share on other sites

This is actually working, but I am getting a wierd array...

Array ( [0] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) [1] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [2] => Array ( [0] => ' [1] => ' [2] => ' [3] => ' [4] => ' [5] => ' [6] => ' [7] => ' [8] => ' [9] => ' [10] => ' ) [3] => Array ( [0] => PROFESSIONAL INS ASSOC, INC. [1] => AMERICAN AIM AUTO INS AGY, INC [2] => SHEPPARD AND NEILSON INSURANCE SERVICES, INC. [3] => PARK FAMILY INS AND FIN SVCS, IN [4] => PROFESSIONAL INS ASSOC, INC. [5] => WES WRIGHT INS SERVICES PROFESSIONAL INS ASSOC, INC. [6] => SHEPPARD AND NEILSON INSURANCE S [7] => R K JACOBS INSURANCE SERVICE [8] => JOHN C MEYER AGENCY [9] => JOHN C MEYER AGENCY [10] => DIVIDE INSURANCE AGENCY ) )

 

and how do I incorporate <div class='addressBlock'><span>5339 SAN JUAN AVE</span><br /><span>FAIR OAKS                    , CA 95628-3318</span>

 

of each result into this regex? Thank you for your help.

Link to comment
Share on other sites

OK, I can sort out the agency names, but still need to modify the regex or run another one to extract the addresses and assign them to the agency name, I tried:

 

$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $html, $matches);
sort($matches);
foreach( $matches[2] as $key => $value){
echo "Agency Name: $value <br />";
}
$addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><\/br>/';
$addyres = preg_match_all($addressspan, $html, $matchesadd);
print_r($matchesadd);

 

but print_r($matchesadd); is comin up blank...

Link to comment
Share on other sites

OK, this regex works :

 

$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $html, $matches);

foreach( $matches[0] as $key => $value){
echo "Agency Name: $value <br />";
}
$addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/';
$addyres = preg_match_all($addressspan, $html, $matchesadd);
foreach( $matchesadd[3] as $key => $value){
echo "Address: $value <br />";
}

but how do I combine them now... hmmm...

Link to comment
Share on other sites

This worked, thanks.

 

$char  = '/<span class=("|\')caaAgencyName2 addwidth("|\')>(.*?)<\/span>/';
$presult = preg_match_all($char, $html, $matches);
$data1 = $matches[0];

$addressspan = '/<div class=("|\')addressBlock("|\')>(.*?)<\/span><br \/><a class=/';
$addyres = preg_match_all($addressspan, $html, $matchesadd);
$data2 = $matchesadd[3];

$data = array_combine($data1, $data2);

foreach( $data as $key => $value){
?>

<tr>
<td>Agency:</td><td><?php echo $key;?></td><td><?php echo $value; ?></td>
</tr>
<?php
}

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.