Web Crawler
От blooddrainer / 20.07.2009 15:32
Здравейте, това е примерен скрипт на БОТ, който индексира даден уеб сайт. Използва PHP и JavaScript (AJAX framework - mootools[версия 1.2+]).
Нека започнем с PHP
CODE
И естествено HTML + JavaScript
CODE
Могат да се измислят още доста неща по скрипта, така че очаквайте скоро подобрения!
mootools можете да си изтеглите от http://www.mootools.net
Нека започнем с PHP
CODE
<?php
/**
 * @name crawl.php
 * @author Blood Drainer
 * @version 1.0.0
 * @copyright 2009
 *
 * AJAX endpoint called by the crawl() JavaScript function of the companion page.
 * Every request fetches one page of the site, adds the links found on it to the
 * list received from the client and echoes a <script> element that either asks
 * for the next step or, once every collected link has been visited, reports
 * completion after sitemap.xml has been written.
 *
 * POST parameters:
 *   links    - ";"-separated list of the links collected so far
 *              ("0" or empty on the very first request)
 *   position - 0 fetches the start page, N > 0 fetches the N-th collected link
 */

/**
 * Turns a collected link into an absolute URL below the configured site.
 *
 * Simplification: the link is always appended to $base, i.e. it is treated as
 * relative to the site root even when it was found on a page in a sub-directory.
 * Prefixing with $base also keeps the fetch on the configured site although the
 * link list travels through the client.
 *
 * @param string $base site address
 * @param string $link link as stored in the list
 * @return string absolute URL
 */
function crawlResolveLink($base, $link)
{
    return rtrim($base, "/") . "/" . ltrim($link, "/");
}

/**
 * Quotes a string as a JavaScript string literal that is safe inside an inline
 * <script> element ("<" and "/" are escaped so the element cannot be closed early).
 *
 * @param string $text raw text
 * @return string single-quoted JavaScript literal
 */
function crawlJsString($text)
{
    return "'" . addcslashes($text, "\\'\r\n</") . "'";
}

/**
 * Fetches $url and appends every new, non-filtered href value found in it to the
 * global $totalLinks list. Links containing any string of the global $removeText
 * list are skipped. A page that cannot be fetched contributes no links.
 *
 * @param string $url absolute URL of the page to scan
 * @return void
 */
function crawl($url)
{
    // Work on the real global list; a local copy would discard every link found.
    global $totalLinks, $removeText;

    // Unreachable pages are expected while crawling, so the warning is suppressed
    // on purpose and the failure simply means "no links on this page".
    $content = @file_get_contents($url);
    if ($content === false) {
        return;
    }

    if (!preg_match_all('/href="(.*?)"[\s>]/', $content, $matches, PREG_PATTERN_ORDER)) {
        return;
    }

    foreach ($matches[1] as $link) {
        // Attribute values are HTML-encoded ("&amp;"); store the real URL so the
        // entity's ";" does not collide with the ";" list delimiter.
        $link = html_entity_decode($link, ENT_QUOTES, "UTF-8");

        // A literal ";" cannot survive the ";"-separated round trip to the client.
        if ($link === "" || strpos($link, ";") !== false) {
            continue;
        }

        $rejected = false;
        foreach ($removeText as $needle) {
            if (strpos($link, $needle) !== false) {
                $rejected = true;
                break;
            }
        }

        if (!$rejected && !in_array($link, $totalLinks, true)) {
            $totalLinks[] = $link;
        }
    }
}

$url = "адресът на сайта, който ще индексирате";
$totalLinks = array();
// "https://" is filtered like "http://" so absolute links are never stored.
$removeText = array("mailto:", "javascript:", ".css", "http://", "https://", "#");

// The first request sends "0", which (like an empty value) means "no links yet".
if (!empty($_POST['links']) && is_string($_POST['links'])) {
    $totalLinks = explode(";", $_POST['links']);
}

$i = isset($_POST['position']) ? (int) $_POST['position'] : 0;
if ($i < 0) {
    $i = 0;
}

if ($i === 0) {
    // First step: scan the start page itself.
    crawl($url);
} elseif (isset($totalLinks[$i - 1])) {
    // Step N scans the N-th collected link (list index N - 1), so the first link
    // is not skipped and the last step does not read past the end of the list.
    crawl(crawlResolveLink($url, $totalLinks[$i - 1]));
}

$data = implode(";", $totalLinks);

// MooTools 1.2 replaced Element.setText() with Element.set('text', ...).
echo "<script>\$('status').set('text', 'в процес..." . (count($totalLinks) + 1) . "')</script>";

if ($i < count($totalLinks)) {
    // The list contains text taken from crawled pages and from the request, so it
    // must be escaped before it is embedded in JavaScript.
    echo "<script>crawl(" . crawlJsString($data) . ", " . ($i + 1) . ")</script>";
} else {
    $xml  = '<?xml version="1.0" encoding="UTF-8"?>';
    $xml .= '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
          . ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"'
          . ' xsi:schemaLocation="http://www.sitemaps.org/schemas/sitemap/0.9'
          . ' http://www.sitemaps.org/schemas/sitemap/0.9/sitemap.xsd">' . "\n";
    $xml .= "<url>\n";
    $xml .= " <loc>" . htmlspecialchars($url, ENT_QUOTES, "UTF-8") . "</loc>\n";
    $xml .= " <priority>1.00</priority>\n";
    $xml .= " <changefreq>daily</changefreq>\n";
    $xml .= "</url>";

    foreach ($totalLinks as $value) {
        // <loc> must hold an absolute URL; htmlspecialchars() emits only entities
        // that XML itself defines, unlike htmlentities().
        $loc = htmlspecialchars(crawlResolveLink($url, $value), ENT_QUOTES, "UTF-8");

        $xml .= "\n<url>\n";
        $xml .= " <loc>" . $loc . "</loc>\n";
        $xml .= " <priority>0.80</priority>\n";
        $xml .= " <changefreq>daily</changefreq>\n";
        $xml .= "</url>";
    }
    $xml .= "\n</urlset>";

    // Report the failure whether the file cannot be opened or cannot be written.
    if (@file_put_contents("sitemap.xml", $xml) === false) {
        echo "<script>alert('Картата на сайта не може да бъде записана!')</script>";
    }

    echo "<script>\$('status').set('text', 'завършен[" . count($totalLinks) . "]')</script>";
}
И естествено HTML + JavaScript
CODE
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head lang="bg">
    <title>Web Crawler</title>
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <!-- Presumably the MooTools core and "more" builds that provide Request.HTML; confirm against the deployed files. -->
    <script type="text/javascript" src="ajax.js"></script>
    <script type="text/javascript" src="ajax-more.js"></script>
</head>
<body lang="bg">
    <div align="center">
        <!-- Starts the crawl with no collected links at position 0. -->
        <input type="button" value="индексирай" onclick="crawl(0,0)" />
    </div>
    <div id="status"></div>
    <script type="text/javascript">
    /**
     * Sends one crawl step to crawl.php as a POST request.
     *
     * @param links    value posted as the "links" field
     * @param position value posted as the "position" field
     */
    function crawl(links, position)
    {
        var payload = {links: links, position: position};

        // Shown when the request itself fails.
        var reportFailure = function()
        {
            alert('Заявката не може да бъде изпълнена!');
        };

        var request = new Request.HTML({
            url: "crawl.php",
            data: payload,
            onFailure: reportFailure
        });

        request.post();
    }
    </script>
</body>
</html>
Могат да се измислят още доста неща по скрипта, така че очаквайте скоро подобрения!
mootools можете да си изтеглите от http://www.mootools.net





