|
这个题目有点大。
下面这段代码是我用来抓取某个网站文章内容的。不过它的针对性很强,假如对方修改了页面代码,抓取就会失效。
要做通用的,不容易啊!
放出来看看,希望对你有用。 - <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
- "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
- <html xmlns="http://www.w3.org/1999/xhtml" xml:lang="gb2312" >
- <head>
- <meta http-equiv="content-type" content="text/html; charset=gb2312" />
- <meta name="author" content="Smiling Dolphin" />
- <meta name="keywords" content="design, css, cascading, style, sheets, xhtml, graphic design, w3c, web standards, visual, display, java, javascript, c++, php, jsp, asp, py, pl" />
- <meta name="description" content="my favorites language." />
- <meta name="robots" content="all" />
- <title>文章读取器,收集器 (用于收藏)</title>
- <style type="text/css" title="currentStyle" media="screen">
- body{font-size:14px;}
- #aurl{width:60%;}
- #content{background:#efefef;border:1px dashed #ccc;padding:5px;}
- p{text-indent:2em;}
- </style>
- </head>
- <body>
- <div><h2>无广告,可复制文章读取器</h2></div>
- <div>
- <form method="post" action="" onsubmit="return chkform(this)">
- 请输入文章地址:<input type="text" name="aurl" id="aurl" value="<?php
- // Pre-fill the address field with the submitted value, or with a sample
- // article URL on first load. empty() covers both "not submitted" and the
- // falsy values the original truthiness test rejected.
- if (!empty($_POST['aurl']) && is_string($_POST['aurl'])) {
-     // The value is client-controlled and lands inside a double-quoted HTML
-     // attribute, so it must be escaped (the page is served as gb2312).
-     echo htmlspecialchars($_POST['aurl'], ENT_QUOTES, 'GB2312');
- } else {
-     echo "http://www.dzwx.net/2006/Article/10262.html";
- }
- ?>" /> <input type="submit" name="submitbtn1" value="view" />
- </form>
- </div>
- <div id="content">
- <?php
- // Handles the "view" submission: downloads the page at the submitted URL,
- // extracts title, author and body with regular expressions written for one
- // specific site layout, and stores the result as data/<id>.xml.
- // Nothing is downloaded when a cache file for the same id already exists.
- // NOTE(review): this block never echoes the article, while the page script
- // looks up elements "a_title"/"a_author"; the display code may be missing
- // from this excerpt - confirm.
- if (isset($_POST['submitbtn1']) && $_POST['submitbtn1'] === "view") {
-     $aurl = (isset($_POST['aurl']) && is_string($_POST['aurl'])) ? trim($_POST['aurl']) : '';
-     // Only http(s) URLs are fetched: fopen() would otherwise open local files
-     // or other stream wrappers named by the client. The cache id is the first
-     // run of 1-8 digits followed by a dot; URLs without one are ignored so the
-     // file name can never contain path characters taken from the URL.
-     if (preg_match('#^https?://#i', $aurl) && preg_match('/(\d{1,8})\./', $aurl, $idMatch)) {
-         $filename = "data/".$idMatch[1].".xml";
-         if (!file_exists($filename)) {
-             $fp = @fopen($aurl, "r");
-             if ($fp) {
-                 $content = '';
-                 while (!feof($fp)) {
-                     $chunk = fread($fp, 4096);
-                     if ($chunk === false) {
-                         // Stop on a read error instead of looping forever.
-                         break;
-                     }
-                     $content .= $chunk;
-                 }
-                 fclose($fp);
-
-                 // Defaults keep every key defined even when a pattern finds nothing.
-                 $article = array('title' => '', 'author' => '', 'content' => '');
-
-                 /**
-                  * Stores one extracted field in the global $article array.
-                  *
-                  * @param string $str extracted text
-                  * @param string $var key to store it under (defaults to 'title')
-                  */
-                 function parseArticle($str, $var = 'title') {
-                     global $article;
-                     $article[$var] = $str;
-                 }
-
-                 // The /e (eval) modifier was removed in PHP 7; capture with
-                 // preg_match() and call parseArticle() directly instead.
-                 if (preg_match('/<div id="artitleb">(.*?)<\/div>/is', $content, $m)) {
-                     parseArticle($m[1], 'title');
-                 }
-                 $introinfo = '';
-                 if (preg_match('/<div class="artitlec">.*?<\/div>/is', $content, $m)) {
-                     $introinfo = $m[0];
-                 }
-                 if (preg_match('/<a href="mailto:.*?>(.*?)<\/a>/is', $introinfo, $m)) {
-                     parseArticle($m[1], 'author');
-                 }
-
-                 $artbody = '';
-                 if (preg_match('/<div class="artitled" id="ArticleBody">.*?<div id="artline">/is', $content, $m)) {
-                     $artbody = $m[0];
-                 }
-                 // Drop hidden spans and the trailing right-aligned block before stripping tags.
-                 $artbody = preg_replace("/<span style='text-align : center;DISPLAY: none;'>.*?<\/span>/is", "", $artbody);
-                 $artbody = preg_replace('/<div align=right style=\'color=gray\'>.*?<div id="artline">/is', "", $artbody);
-                 $artbody = strip_tags($artbody);
-                 // NOTE(review): the inner call removes all ASCII whitespace, so the
-                 // literal-space patterns on the next two lines can never match as
-                 // written. The space was presumably "&nbsp;" or a full-width space
-                 // before the post was rendered - confirm and restore.
-                 $artbody = preg_replace("/ /","|",preg_replace("/\s/","",$artbody));
-                 $article['content'] = "<p>".preg_replace("/\|{1,10}| /","\n</p>\n<p>\n",$artbody)."\n</p>";
-
-                 if (!is_dir("./data")) @mkdir("./data");
-
-                 // Byte-wise escaping of XML metacharacters; these ASCII bytes do not
-                 // occur inside GB2312 double-byte sequences, so the text is not corrupted.
-                 $xmlFrom = array('&', '<', '>', '"');
-                 $xmlTo = array('&amp;', '&lt;', '&gt;', '&quot;');
-                 // A literal "]]>" would end the CDATA section early; split it across two sections.
-                 $cdata = str_replace(']]>', ']]]]><![CDATA[>', $article['content']);
-
-                 $xmlstr = '<?xml version="1.0" encoding="gb2312"?>'."\n";
-                 $xmlstr.= '<article>'."\n";
-                 $xmlstr.= ' <title>'.str_replace($xmlFrom, $xmlTo, $article['title']).'</title>'."\n";
-                 $xmlstr.= ' <link>'.str_replace($xmlFrom, $xmlTo, $aurl).'</link>'."\n";
-                 $xmlstr.= ' <author>'.str_replace($xmlFrom, $xmlTo, $article['author']).'</author>'."\n";
-                 $xmlstr.= ' <content><![CDATA['.$cdata.']]></content>'."\n";
-                 $xmlstr.= '</article>';
-                 // Best-effort write: a failure to open the cache file is silently skipped.
-                 if ($lfp = @fopen($filename,"w")) {
-                     fwrite($lfp,$xmlstr);
-                     fclose($lfp);
-                 }
-             }
-         } else {
-             // A cached copy already exists; nothing is fetched again.
-         }
-     }
- }
- ?>
- </div>
- <script language="javascript" type="text/javascript">
- function chkform(obj){
- if(!obj.aurl.value)
- {
- alert('请输入文章的地址!');
- return false;
- }
- return true;
- }
- function $(sname){
- return document.getElementById(sname);
- }
- window.onload = function(){
- if($("a_title"))
- document.title = $("a_title").innerHTML + " " + $("a_author").innerHTML;
- }
- </script>
- </body>
- </html>
复制代码 |
|