In depth cleaning - database field

randall · March 24, 2012

I have been trying to hack two parts of code together... I had code writen that will grab the text from a website and completely clean it of all junk except for full words... then echo it. Now I am trying to use the same script to pull from a database instead of a URL but am lost... Here is my code... I would make another donation to the site if we can get this going... THANK YOU!


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));

?>

AyKay47 · March 24, 2012

It would help if you showed the actual results after the "clean" function is ran on $raw.

randall · March 24, 2012

Lets use this code example instead... not much different but it pulls info...

Here is the output

http://salesleadhq.com/mien/new.php


<?php

$con = mysql_connect("localhost","USERNAME","PASSWORD!");
mysql_select_db("DATABASE",$con);

$get = "SELECT * FROM information_description WHERE information_id=4";
$SQ_query = mysql_query($get) or die("Query failed: $get\n" . mysql_error());
$fetch = mysql_fetch_array($SQ_query);

$raw = $fetch['description'];


/* Set internal character encoding to UTF-8 */
mb_internal_encoding("UTF-8");
mb_http_output( "UTF-8" );
ob_start("mb_output_handler");

function clean($html) {
###Remove number in html################
//$html  = preg_replace("/[0-9]/", " ", $html);


$html =  preg_replace("/<([a-z][a-z0-9]*)[^>]*?(\/?)>/i",'<$1$2>', $html);

//	$html = preg_replace('/(<[^>]+) style=".*?"/i', '$1', $html);
echo $html;

$html = str_replace(" ", " ", $html);
$html = str_replace("&", " ", $html);
$html = str_replace("-", " ", $html);
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\s\s+/', ', ', $html);
$html =  preg_replace('/[\s\W]+/',' ',$html);   // Strip off spaces and non-alpha-numeric 
return $html;
}

#call function
//$raw = StripHtmlTags($raw);
$raw = clean($raw);
echo $raw;

##echo clean($html);

$url = (isset($_GET['url']) ?$_GET['url'] : 0);
$str = file_get_contents($url);
####################################################################3
function get_url_contents($url){
        $crl = curl_init();
        $timeout = 5;
        curl_setopt ($crl, CURLOPT_URL,$url);
        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);
        $ret = curl_exec($crl);
        curl_close($crl);
        return $ret;
}
#--------------------------------------Strip html tag----------------------------------------------------
function StripHtmlTags( $text )
{
  // PHP's strip_tags() function will remove tags, but it
  // doesn't remove scripts, styles, and other unwanted
  // invisible text between tags.  Also, as a prelude to
  // tokenizing the text, we need to insure that when
  // block-level tags (such as <p> or <div>) are removed,
  // neighboring words aren't joined.
  $text = preg_replace(
    array(
      // Remove invisible content
      '@<head[^>]*?>.*?</head>@siu',
      '@<style[^>]*?>.*?</style>@siu',
      '@<script[^>]*?.*?</script>@siu',
      '@<object[^>]*?.*?</object>@siu',
      '@<embed[^>]*?.*?</embed>@siu',
      '@<applet[^>]*?.*?</applet>@siu',
      '@<noframes[^>]*?.*?</noframes>@siu',
      '@<noscript[^>]*?.*?</noscript>@siu',
      '@<noembed[^>]*?.*?</noembed>@siu',

      // Add line breaks before & after blocks
      '@<((br)|(hr))@iu',
      '@</?((address)|(blockquote)|(center)|(del))@iu',
      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',
      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',
      '@</?((table)|(th)|(td)|(caption))@iu',
      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',
      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',
      '@</?((frameset)|(frame)|(iframe))@iu',
    ),
    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );

  // Remove all remaining tags and comments and return.
  return strtolower( $text );
}

function RemoveComments( & $string )
{
  $string = preg_replace("%(#|;|(//)).*%","",$string);
  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead
  return $string;
}


$html = StripHtmlTags($str);

###Remove number in html################
$html  = preg_replace("/[0-9]/", " ", $html);

#replace   by ' '
$html = str_replace(" ", " ", $html);

######remove any words################

$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
foreach($remove_word as $word) {
$html = preg_replace("/\b". $word ."\b/", " ", $html);
}
######remove space
$html =  preg_replace ('/<[^>]*>/', '', $html);

$html =  preg_replace('/\b\s+/', ', ', $html);
$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 

#remove white space, Keep : . ( ) : &
//$html = preg_replace('/\s+/', ', ', $html);


###process#########################################################################
$array_loop = explode(",", $html);
$array_loop1 = $array_loop;
$arr_tem = array();

foreach($array_loop as $key=>$val) {
if(in_array($val, $array_loop1)) {
	if(!$arr_tem[$val]) $arr_tem[$val] = 0;
	$arr_tem[$val] += 1;

	if ( ($k = array_search($val, $array_loop1) ) !== false )
	unset($array_loop1[$k]);
}
}

arsort($arr_tem);

###echo top 20 words############################################################
echo "<h3>Top 20 words used most</h3>";
$i = 1;
foreach($arr_tem as $key=>$val) {
if($i<=20) {
	echo $i.":  ".$key." (".$val." words)<br />";
	$i++;
}else break;
}
echo "<hr />";
###print array#####################################################################
echo (implode(", ", array_keys($arr_tem)));



?>

AyKay47 · March 24, 2012

I think you are over complicating this.

First, to remove any tags found in the string, you can simply use strip_tags which will leav you with the content inside of the tags.

Then you can do any other sanitization you want.

randall · March 24, 2012

Anyone want $20.00? I just want it to work so I can move on...

This works without database... it pulls from the url

http://salesleadhq.com/tools/crawler/meta.php?url=http://www.boormanarchery.com

The complete code for that file is below this message. It works perfectly...

I thought I was over complicating it as well and spent quite a while figuring this out... I tried "strip_tags()" and it left me with a bunch of non human readable text. All I want left over are full english words, (not leftovers) and I want to echo those full words... I want the words that apear more often to be listed in order. I dont know what I am doing at this point... I have been trying to get it done for a few weeks.

I have a text file that contains words that I want to omitt as well.

Please feel sorry for me.

thanks in advance!

<?php




$url = (isset($_GET['url']) ?$_GET['url'] : 0);

$str = file_get_contents($url);

####################################################################3

function get_url_contents($url){

        $crl = curl_init();

        $timeout = 5;

        curl_setopt ($crl, CURLOPT_URL,$url);

        curl_setopt ($crl, CURLOPT_RETURNTRANSFER, 1);

        curl_setopt ($crl, CURLOPT_CONNECTTIMEOUT, $timeout);

        $ret = curl_exec($crl);

        curl_close($crl);

        return $ret;

}

#--------------------------------------Strip html tag----------------------------------------------------

function StripHtmlTags( $text )

{

  // PHP's strip_tags() function will remove tags, but it

  // doesn't remove scripts, styles, and other unwanted

  // invisible text between tags.  Also, as a prelude to

  // tokenizing the text, we need to insure that when

  // block-level tags (such as <p> or <div>) are removed,

  // neighboring words aren't joined.

  $text = preg_replace(

    array(

      // Remove invisible content

      '@<head[^>]*?>.*?</head>@siu',

      '@<style[^>]*?>.*?</style>@siu',

      '@<script[^>]*?.*?</script>@siu',

      '@<object[^>]*?.*?</object>@siu',

      '@<embed[^>]*?.*?</embed>@siu',

      '@<applet[^>]*?.*?</applet>@siu',

      '@<noframes[^>]*?.*?</noframes>@siu',

      '@<noscript[^>]*?.*?</noscript>@siu',

      '@<noembed[^>]*?.*?</noembed>@siu',



      // Add line breaks before & after blocks

      '@<((br)|(hr))@iu',

      '@</?((address)|(blockquote)|(center)|(del))@iu',

      '@</?((div)|(h[1-9])|(ins)|(isindex)|(p)|(pre))@iu',

      '@</?((dir)|(dl)|(dt)|(dd)|(li)|(menu)|(ol)|(ul))@iu',

      '@</?((table)|(th)|(td)|(caption))@iu',

      '@</?((form)|(button)|(fieldset)|(legend)|(input))@iu',

      '@</?((label)|(select)|(optgroup)|(option)|(textarea))@iu',

      '@</?((frameset)|(frame)|(iframe))@iu',

    ),

    array(' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', ' ', "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0", "\n\$0",),$text );



  // Remove all remaining tags and comments and return.

  return strtolower( $text );

}



function RemoveComments( & $string )

{

  $string = preg_replace("%(#|;|(//)).*%","",$string);

  $string = preg_replace("%/\*(??!\*/).)*\*/%s","",$string); // google for negative lookahead

  return $string;

}





$html = StripHtmlTags($str);



###Remove number in html################

$html  = preg_replace("/[0-9]/", " ", $html);



#replace   by ' '

$html = str_replace(" ", " ", $html);



######remove any words################



$remove_word = file("swords.txt", FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);

foreach($remove_word as $word) {

$html = preg_replace("/\b". $word ."\b/", " ", $html);

}

######remove space

$html =  preg_replace ('/<[^>]*>/', '', $html);



$html =  preg_replace('/\b\s+/', ', ', $html);

$html =  preg_replace('/[\b\W]+/',', ',$html);   // Strip off spaces and non-alpha-numeric 



#remove white space, Keep : . ( ) : &

//$html = preg_replace('/\s+/', ', ', $html);





###process#########################################################################

$array_loop = explode(",", $html);

$array_loop1 = $array_loop;

$arr_tem = array();



foreach($array_loop as $key=>$val) {

if(in_array($val, $array_loop1)) {

	if(!$arr_tem[$val]) $arr_tem[$val] = 0;

	$arr_tem[$val] += 1;



	if ( ($k = array_search($val, $array_loop1) ) !== false )

	unset($array_loop1[$k]);

}

}



arsort($arr_tem);



###echo top 20 words############################################################

echo "<h3>Top 20 words used most</h3>";

$i = 1;

foreach($arr_tem as $key=>$val) {

if($i<=20) {

	echo $i.":  ".$key." (".$val." words)<br />";

	$i++;

}else break;

}

echo "<hr />";

###print array#####################################################################

echo (implode(", ", array_keys($arr_tem)));



?>

Sign In

In depth cleaning - database field

Recommended Posts

randall

Link to comment

Share on other sites

AyKay47

Link to comment

Share on other sites

randall

Link to comment

Share on other sites

AyKay47

Link to comment

Share on other sites

randall

Link to comment

Share on other sites

Join the conversation

Browse

Activity

Important Information