Jump to content

In depth cleaning - database field


randall

Recommended Posts

 

I have been trying to hack two parts of code together... I had code writen that will grab the text from a website and completely clean it of all junk except for full words... then echo it.  Now I am trying to use the same script to pull from a database instead of a URL but am lost... Here is my code... I would make another donation to the site if we can get this going... THANK YOU!

 


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));

?>

Link to comment
Share on other sites

 

 

 

 

Lets use this code example instead... not much different but it pulls info...

 

Here is the output

http://salesleadhq.com/mien/new.php

 


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));



?>


 

 

 

Link to comment
Share on other sites

Anyone want $20.00? I just want it to work so I can move on...

 

This works without database... it pulls from the url

http://salesleadhq.com/tools/crawler/meta.php?url=http://www.boormanarchery.com

 

The complete code for that file is below this message. It works perfectly...

 

I thought I was over complicating it as well and spent quite a while figuring this out... I tried "strip_tags()" and it left me with a bunch of non human readable text. All I want left over are full english words, (not leftovers) and I want to echo those full words... I want the words that apear more often to be listed in order. I dont know what I am doing at this point... I have been trying to get it done for a few weeks.

 

I have a text file that contains words that I want to omitt as well.

Please feel sorry for me.  :)

 

thanks in advance!

 

<?php




$url = (isset($_GET['url']) ?$_GET['url'] : 0);

$str = file_get_contents($url);

####################################################################3

function get_url_contents($url){

        $crl = curl_init();

        $timeout = 5;

        curl_setopt ($crl, CURLOPT_URL,$url);

        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);

        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);

        $ret = curl_exec($crl);

        curl_close($crl);

        return $ret;

}

#--------------------------------------Strip html tag----------------------------------------------------

function StripHtmlTags( $text )

{

  // PHP's strip_tags() function will remove tags, but it

  // doesn't remove scripts, styles, and other unwanted

  // invisible text between tags.  Also, as a prelude to

  // tokenizing the text, we need to insure that when

  // block-level tags (such as <p> or <div>) are removed,

  // neighboring words aren't joined.

  $text = preg_replace(

    array(

      // Remove invisible content

      '@<head[^>]*?>.*?</head>@siu',

      '@<style[^>]*?>.*?</style>@siu',

      '@<script[^>]*?.*?</script>@siu',

      '@<object[^>]*?.*?</object>@siu',

      '@<embed[^>]*?.*?</embed>@siu',

      '@<applet[^>]*?.*?</applet>@siu',

      '@<noframes[^>]*?.*?</noframes>@siu',

      '@<noscript[^>]*?.*?</noscript>@siu',

      '@<noembed[^>]*?.*?</noembed>@siu',



      // Add line breaks before & after blocks

      '@<((br)|(hr))@iu',

      '@</?((address)|(blockquote)|(center)|(del))@iu',

      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',

      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',

      '@</?((table)|(th)|(td)|(caption))@iu',

      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',

      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',

      '@</?((frameset)|(frame)|(iframe))@iu',

    ),

    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );



  // Remove all remaining tags and comments and return.

  return strtolower( $text );

}



function RemoveComments( & $string )

{

  $string = preg_replace("%(#|;|(//)).*%","",$string);

  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead

  return $string;

}





$html = StripHtmlTags($str);



###Remove number in html################

$html  = preg_replace("/[0-9]/", " ", $html);



#replace   by ' '

$html = str_replace(" ", " ", $html);



######remove any words################



$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

foreach($remove_word as $word) {

$html = preg_replace("/\b". $word ."\b/", " ", $html);

}

######remove space

$html =  preg_replace ('/<[^>]*>/', '', $html);



$html =  preg_replace('/\b\s+/', ', ', $html);

$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 



#remove white space, Keep : . ( ) : &

//$html = preg_replace('/\s+/', ', ', $html);





###process#########################################################################

$array_loop = explode(",", $html);

$array_loop1 = $array_loop;

$arr_tem = array();



foreach($array_loop as $key=>$val) {

if(in_array($val, $array_loop1)) {

	if(!$arr_tem[$val]) $arr_tem[$val] = 0;

	$arr_tem[$val] += 1;



	if ( ($k = array_search($val, $array_loop1) ) !== false )

	unset($array_loop1[$k]);

}

}



arsort($arr_tem);



###echo top 20 words############################################################

echo "<h3>Top 20 words used most</h3>";

$i = 1;

foreach($arr_tem as $key=>$val) {

if($i<=20) {

	echo $i.":  ".$key." (".$val." words)<br />";

	$i++;

}else break;

}

echo "<hr />";

###print array#####################################################################

echo (implode(", ", array_keys($arr_tem)));



?>

 

 

Link to comment
Share on other sites

This thread is more than a year old. Please don't revive it unless you have something important to add.

Join the conversation

You can post now and register later. If you have an account, sign in now to post with your account.

Guest
Reply to this topic...

×   Pasted as rich text.   Restore formatting

  Only 75 emoji are allowed.

×   Your link has been automatically embedded.   Display as a link instead

×   Your previous content has been restored.   Clear editor

×   You cannot paste images directly. Upload or insert images from URL.

×
×
  • Create New...

Important Information

We have placed cookies on your device to help make this website better. You can adjust your cookie settings, otherwise we'll assume you're okay to continue.