<title>Podgląd robota indeksującego wyszukiwarek</title>
</head>

<body>

<form name=mainform action="" method="get">
<table border="0" width="100%" align=center>
<tr>
<td>Wprowadź adres URL: <br>
<input type="text" name="url" size="20"></td>
</tr>
<tr>
<td>
<input type="submit" value="Kliknij, aby zobaczyć podgląd robota indeksującego" name="submit"></td>
</tr>
</table>
</form>
<hr>

<?php
// Render the spider view only when the form was actually submitted.
// The parameter is checked with isset() *before* it is read, so the first
// page load (no "url" in the query string) does not raise an
// "undefined index/array key" notice, and non-string input such as
// ?url[]=x is ignored instead of being passed on to cURL.
$myurl = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : null;
if ($myurl !== null) {
    print spiderViewer($myurl);
}

?>

</body>
</html>

<?php
/**
 * Builds an HTML fragment showing the plain text of the page at $url:
 * the page is downloaded with get_content(), markup and entities are
 * stripped, stop words are removed and whitespace is collapsed.
 *
 * @param string $url Address of the page to preview (user supplied).
 * @return string HTML table with the extracted text, or a plain error
 *                message when the URL is empty or nothing could be fetched.
 */
function spiderViewer($url) {
    if (!$url) {
        return 'Wprowadzony adres URL jest niepoprawny.';
    }

    $text = get_content($url);
    if (!$text) {
        return 'Sprawdź swój adres URL.';
    }

    // Ordered list of (pattern, replacement) steps; the order matters.
    $steps = array(
        // Remove whole elements whose content is not page text.
        array('/<script.*?>.*?<\/script.*?>/sim', ''),
        array('/<object.*?>.*?<\/object.*?>/sim', ''),
        array('/<applet.*?>.*?<\/applet.*?>/sim', ''),
        array('/<style.*?>.*?<\/style.*?>/sim', ''),
        // Remove every remaining tag.
        array('/<.*?>/sim', ''),
        // Replace character entities (&nbsp; &#160; &#xA0;) with a space.
        // The name is limited to alphanumerics so that real text between an
        // "&" and some later ";" is not swallowed.
        array('/&#?[a-z0-9]+;/i', ' '),
    );

    // Stop words are stored in stopwords.txt separated by "<br />".
    // A missing or unreadable file simply means no stop words are removed.
    $stopWordsRaw = is_readable('stopwords.txt') ? file_get_contents('stopwords.txt') : false;
    if (is_string($stopWordsRaw)) {
        foreach (explode('<br />', $stopWordsRaw) as $stopWord) {
            // NOTE(review): surrounding whitespace is assumed to be file
            // formatting rather than part of the word - confirm against the
            // real stopwords.txt.
            $stopWord = trim($stopWord);
            if ($stopWord === '') {
                // An empty entry would yield /\W\W/ and delete arbitrary
                // punctuation/whitespace pairs.
                continue;
            }
            // preg_quote() keeps words containing regex metacharacters from
            // breaking or altering the pattern. The match (word plus one
            // delimiter on each side) is replaced with a space so the
            // neighbouring words are not glued together.
            $steps[] = array('/\W' . preg_quote($stopWord, '/') . '\W/i', ' ');
        }
    }

    // Keep only ASCII letters, digits, basic punctuation and whitespace.
    // NOTE(review): this also drops all non-ASCII letters (e.g. Polish
    // diacritics) - confirm that is intended.
    $steps[] = array('/[^A-Z0-9a-z\.\?\!\;\,\-\r\n ]*/sim', '');
    // Collapse runs of spaces/newlines into a single space.
    $steps[] = array('/[\r\n ]{2,1000}/sim', ' ');

    foreach ($steps as $step) {
        $result = preg_replace($step[0], $step[1], $text);
        // preg_replace() returns null on a PCRE error (e.g. backtrack limit
        // on a very large page); keep the previous text in that case.
        if ($result !== null) {
            $text = $result;
        }
    }

    // The URL comes straight from the request, so it must be escaped before
    // being placed in the markup. $text needs no escaping: the character
    // filter above has already removed "<", ">", "&" and quotes.
    $safeUrl = htmlspecialchars($url, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');

    return '<table border="0" align="center" width="75%">'
        . '<tr><td align="center" valign="top">'
        . '<b>Spider View for URL:' . $safeUrl . '</b></td></tr>'
        . '<tr><td align="left" valign="top">'
        . $text . '</td></tr></table>';
}

/**
 * Downloads the response body for $url using cURL, following redirects.
 *
 * Only http:// and https:// are allowed, both for the initial request and
 * for redirects, because $url is user supplied and cURL would otherwise
 * also accept schemes such as file:// or gopher://.
 * NOTE(review): requests to internal/private hosts are still possible;
 * add host filtering if this page is reachable from untrusted networks.
 *
 * @param string $url Address to fetch.
 * @return string Response body (including bodies of HTTP error responses),
 *                or an empty string when the transfer fails.
 */
function get_content($url)
{
    $ch = curl_init();
    if ($ch === false) {
        return '';
    }

    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_FAILONERROR, 0);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
    // Return the body from curl_exec() instead of echoing it, so no output
    // buffering is needed to capture it.
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
    curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)');
    curl_setopt($ch, CURLOPT_TIMEOUT, 30); // whole transfer times out after 30 s

    if (preg_match('/^https:\/\//i', $url)) {
        // NOTE(review): certificate and host verification are switched off
        // for https URLs, so the fetched content can be tampered with in
        // transit. Kept to preserve existing behaviour for sites with
        // broken certificates - enable verification if that is not needed.
        curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
        curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false);
    }

    $body = curl_exec($ch);
    curl_close($ch);

    // curl_exec() returns false on failure; callers expect a string.
    return is_string($body) ? $body : '';
}

?>