Jump to content

HTML DOM Parser


Help!php

Recommended Posts

I use the HTML DOM parser to fetch the price and a few other details from a different website and add them to my database. My database has a URL column, which the script uses to visit their website and find the information for each product. For every new product, the script should get the URL, save it to the database and add the price, but this is not working.

 

It works with the URLs that are already in my database, but new ones aren't detected, even though they should be. My code is shown below. Please help.

 

require_once 'simplehtmldom/simple_html_dom.php';

// Debug switch: when TRUE the stages below echo their progress.
$_ECHO = FALSE;
// File the export stage at the end of the script writes the filtered URLs to.
$outputFileName = "sitemap.txt";

// Fetching one remote page per row can take a long time, so lift the time limit.
set_time_limit( 0 );

echo "MYSQL: Connecting to DB...<br />";

// Connect to DB. Stop immediately on failure: every later step needs $con.
$con = mysql_connect( "localhost", "root", "" );

if ( $con === FALSE )
{
	die( "MYSQL: Could not connect: " . mysql_error() );
}

// Select DB
if ( !mysql_select_db( "db", $con ) )
{
	die( "MYSQL: Could not select DB: " . mysql_error( $con ) );
}

// Get every row that still needs a price.
// The sitemap stage inserts rows with only `url` set, so their productid and
// price are NULL. The previous filter (NOT ISNULL(productid) AND price='0.00')
// therefore never picked up newly added URLs; match a missing price as well.
$qry = "SELECT * FROM SITE WHERE price IS NULL OR price = '0.00' ORDER BY productid ASC";
$result = mysql_query ( $qry, $con );

if ( $result === FALSE )
{
	die( "MYSQL: Could not read SITE: " . mysql_error( $con ) );
}

// One parser instance, re-loaded with each fetched page.
$html = new simple_html_dom();

echo "START updating PLSITEMAP DB...<br />";

// For each selected row: download the product page, skip ex-demo / box-open /
// discontinued items, scrape the price and store price + type for that URL.
while ( $resultArray = mysql_fetch_assoc( $result ) )
{
	$url = $resultArray[ 'url' ];
	$fileContents = file_get_contents( $url );

	// A failed download has nothing to parse; move on to the next row.
	if ( $fileContents === FALSE )
	{
		if ( $_ECHO ) echo "Could not fetch $url<br />";
		continue;
	}

	$html->load( $fileContents );

	$metas = $html->find( "meta[name=Keywords]" );

	// Without the keywords tag the product cannot be classified, so skip it.
	if ( !isset( $metas[ 0 ] ) )
	{
		if ( $_ECHO ) echo "<meta> tag NOT FOUND<br />";
		continue;
	}

	// Upper-case once so the checks below are case-insensitive.
	$keywords = strtoupper( $metas[ 0 ]->content );

	$stop = FALSE;
	foreach ( array( "EXDEMO", "BOXOPEN", "BOX OPEN", "DISCONTINUED" ) as $marker )
	{
		if ( strpos( $keywords, $marker ) !== FALSE )
		{
			$stop = TRUE;
			break;
		}
	}

	if ( $stop )
	{
		continue;
	}

	// "p" when the keywords mention PRINTER, otherwise "c".
	$type = ( strpos( $keywords, "PRINTER" ) === FALSE ? "c" : "p" );

	// Product number: text after the first ";" (if any), with "/" turned into "_".
	// It is only used for the debug output below.
	$pid = $html->find( "#ctl00_placeholderMain_lblItem" );

	if ( isset( $pid[ 0 ] ) )
	{
		$pn = strip_tags( $pid[ 0 ] );
		$sep = strpos( $pn, ";" );

		if ( $sep !== FALSE )
		{
			$pn = substr( $pn, $sep + 1 );
		}

		$pn = trim( str_replace( "/", "_", $pn ) );
	}
	else
	{
		// Product id not found on the page
		$pn = "0";
	}

	if ( $_ECHO ) echo "$pn<br />";

	// Look for the price
	$rrp = $html->find( "#ctl00_placeholderMain_lbltxtProductPrice" );
	$price = "0.00";

	if ( isset( $rrp[ 0 ] ) )
	{
		// Tidy it up - remove commas and weird Word chars
		$priceText = str_replace( "Â", "", strip_tags( $rrp[ 0 ] ) );

		// Drop everything up to the first ";" only when there is one.
		// strpos() returns FALSE when it is absent, and FALSE + 1 == 1 used to
		// cut off the first character of the price.
		$sep = strpos( $priceText, ";" );

		if ( $sep !== FALSE )
		{
			$priceText = substr( $priceText, $sep + 1 );
		}

		$priceText = str_replace( ",", "", $priceText );

		// Keep only the first number so a leading currency symbol or trailing
		// text cannot end up in the SQL value.
		if ( preg_match( '/\d+(?:\.\d+)?/', $priceText, $match ) )
		{
			$price = $match[ 0 ];
		}
	}

	if ( $_ECHO ) echo "$price<br />";

	// $price is numeric and $type is "c" or "p"; the URL is arbitrary text and
	// must be escaped before it is placed inside the quoted SQL string.
	$qry = "UPDATE SITE SET price='$price',  type='$type' WHERE url='" . mysql_real_escape_string( $url, $con ) . "'";

	if ( $_ECHO ) echo "$qry<br />";

	if ( !mysql_query ( $qry, $con ) )
	{
		echo "MYSQL: Update failed for $url: " . mysql_error( $con ) . "<br />";
	}
}

if ( $_ECHO ) echo "DONE updating PLSITEMAP DB<br />";

// NOTE(review): execution stops here, so nothing below this line runs and no
// new URLs are fetched. The next stage drops the SITE table, so do not simply
// remove this exit - confirm the intended order of the stages first.
exit(0);

// Rebuild stage: throws away the SITE table and creates an empty one.
if($_ECHO) echo "MYSQL: Deleting existing table...<br />";

// Delete any existing table data
$qry = "DROP TABLE IF EXISTS SITE";

if ( !mysql_query ( $qry, $con ) )
{
	die( "MYSQL: Could not drop SITE: " . mysql_error( $con ) );
}

if($_ECHO) echo "MYSQL: Creating new table...<br />";

// Create new one. The price-update stage writes a one-letter `type` value to
// SITE, so the table needs a `type` column or that UPDATE fails on a freshly
// created table.
$qry = "CREATE TABLE SITE ( productid varchar(30), price decimal(6,2), type char(1), url varchar( 1024 ) )";

// Create the table
if ( !mysql_query ( $qry, $con ) )
{
	die( "MYSQL: Could not create SITE: " . mysql_error( $con ) );
}

// Sitemap stage: walk the numbered sitemap pages and store every product link.
$numSitemapPages = 350;
$html = new simple_html_dom();

if($_ECHO) echo "START: Fetching site map...<br />";

for( $i = 0; $i < $numSitemapPages; $i++ )
{
	if($_ECHO) echo "Page $i<br />";

	$fileContents = file_get_contents( "http://www.website.co.uk/SiteMap-S" . $i . ".aspx" );

	// Skip pages that could not be downloaded instead of parsing FALSE.
	if ( $fileContents === FALSE )
	{
		if($_ECHO) echo "Could not fetch sitemap page $i<br />";
		continue;
	}

	$html->load( $fileContents );

	// Links of interest are picked out by their exact inline style attribute.
	$hrefs = $html->find( "a[style=color: Blue; text-decoration: underline;]" );

	if ( !isset( $hrefs[ 0 ] ) )
	{
		if($_ECHO) echo "NO URLS FOUND ON THIS PAGE!<br />";
		continue;
	}

	foreach( $hrefs as $href )
	{
		$url = "http://www.website.co.uk/" . $href->href;

		// Insert into SITE - the table the rest of the script creates, reads and
		// updates - and escape the URL so a quote in it cannot break the statement.
		$qry = "INSERT INTO SITE (url) VALUES( '" . mysql_real_escape_string( $url, $con ) . "' )";

		if ( mysql_query( $qry, $con ) )
		{
			if($_ECHO) echo "MYSQL: Added $href->href to DB<br />";
		}
		else
		{
			echo "MYSQL: Could not add $href->href: " . mysql_error( $con ) . "<br />";
		}
	}
}

echo "END: Fetching site map...<br />";

// NOTE(review): the export stage below is unreachable while this exit is in place.
exit(0);

// Export stage: select the SITE rows whose URL looks like a printer product and
// write those URLs, one per line, to $outputFileName.
if($_ECHO) echo "MYSQL: Cleaning DB list...<br />";

// URL fragments that mark a row as NOT being a printer.
// NOTE(review): 'acessory' looks like a misspelling of 'accessory' - confirm
// against the real URLs before changing it, since that alters which rows match.
$excludedTerms = array(
	'coax', 'brochure', 'cabinet', 'kit', 'yellow', 'magenta', 'cyan', 'warranty',
	'service', 'zebra', 'simm', 'dimm', 'memory', 'ribbon', 'cartridge', 'paper',
	'transparency', 'hard-disk', 'year', '-Sheet-', 'Fuser', 'Imaging', 'print-head',
	'Duplex-Unit', 'black-image', 'unit', 'transfer-', 'maintenance', 'spindle',
	'mailbox', 'acessory', 'barcode', 'toner', 'label', 'feeder', 'server', 'tape',
	'ex-demo', 'opened', 'creased',
);

// At least one of these brand names has to appear in the URL.
$brandTerms = array(
	'brother', 'canon', 'dell', 'epson', 'hp', 'konica', 'kyocera', 'lexmark',
	'oki', 'panasonic', 'ricoh', 'samsung', 'tally', 'xante', 'xerox',
);

$excludeClauses = array();
foreach ( $excludedTerms as $term )
{
	$excludeClauses[] = "url not like '%" . $term . "%'";
}

$brandClauses = array();
foreach ( $brandTerms as $term )
{
	$brandClauses[] = "url like '%" . $term . "%'";
}

// This should filter out most non-printer products
$qry = "SELECT * FROM `SITE` where "
	. implode( " and ", $excludeClauses )
	. " and (" . implode( " or ", $brandClauses ) . ")";

if($_ECHO) echo "MYSQL: Exporting printer list...<br />";

$result = mysql_query ( $qry, $con );

if ( $result === FALSE )
{
	die( "MYSQL: Could not read printer list: " . mysql_error( $con ) );
}

$fp = fopen( $outputFileName, "w" );

if ( $fp === FALSE )
{
	die( "Could not open $outputFileName for writing" );
}

while ( $row = mysql_fetch_assoc( $result ) )
{
	fputs( $fp, $row["url"] . "\n" );
}

fclose( $fp );

if($_ECHO) echo "MYSQL: Done exporting printer list...<br />";

mysql_close( $con );

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.