基于Spindle的增强HTTP Spider

gstarwd

浏览: 1489086 次
性别:
来自: 杭州

最近访客更多访客>>

cl_andywin

sagadan

scj2cy

wangyy

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

HtmlAnalysis

lucene IE Apache Socket .net

zz:http://www.iteye.com/news/1731

构建于lucene之上的可用的Java开源Spider少之又少,spindle长期没有更新且功能不够完善,故而自己参考其源
代码重新编写了一个可扩展的WebCrawler,本着开源共享,共同进步的想法发布于此,期冀得到大家的批评指正,
有任何意见及建议均可Email联系我 (kaninebruno@hotmail.com)
以下代码基于lucene-2.3.1,htmlparser-1.6,je-analysis-1.5.3,以及自己修改过的cpdetector-1.0.5;
下载地址分别为
htmlparser：http://sourceforge.net/project/showfiles.php?group_id=24399
je-analysis：http://www.jesoft.cn/je-analysis-1.5.3.jar
lucene就不用说了,cpdetector-1.0.5见附件.
spindle的官方站点：http://www.bitmechanic.com/projects/spindle/

Java 代码

package com.huizhi.kanine.util;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.SocketException;
import java.net.SocketTimeoutException;
import java.net.URL;
import java.net.UnknownHostException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import jeasy.analysis.MMAnalyzer;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.DateTools;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.Hits;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.RAMDirectory;
import org.htmlparser.Parser;
import org.htmlparser.PrototypicalNodeFactory;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.HasAttributeFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.BaseHrefTag;
import org.htmlparser.tags.FrameTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.tags.MetaTag;
import org.htmlparser.util.EncodingChangeException;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.visitors.HtmlPage;
import cpdetector.io.ASCIIDetector;
import cpdetector.io.CodepageDetectorProxy;
import cpdetector.io.JChardetFacade;
import cpdetector.io.ParsingDetector;
import cpdetector.io.UnicodeDetector;
/**
* @author 张波
* E-mail:kaninebruno@hotmail.com
* Created On : 2008-03-30
*/
public class SiteCapturer implements Runnable{
/* Base (initial) URL the crawl starts from */
protected URL mSource;
/* Location where the index files are stored */
protected String mTarget;
/**
* URLs waiting to be parsed; every newly discovered link is appended here.
* Entries are taken from the front of the list, i.e. in
* first-in first-out order.
*/
protected ArrayList mPages;
/* URLs that have already been dequeued, used to avoid fetching a link twice */
protected HashSet mFinished;
/* htmlparser instance used to extract links from fetched pages */
protected Parser mParser;
/* Buffer size for StringBuffer (per the original author's note) */
protected final int TRANSFER_SIZE = 4096 ;
/* Line separator of the current platform */
protected static String lineSep = System.getProperty( "line.separator" );
/* Number of crawler threads; the constructor sets it to 2 */
protected int mthreads;
/* Worker threads started by capture() */
protected ArrayList threadList;
/* IndexWriter backed by the on-disk index */
protected IndexWriter FSDWriter;
/* IndexWriter backed by the in-memory index */
protected IndexWriter RAMWriter;
/* Searcher over the on-disk index; only created when mCheck is enabled */
protected IndexSearcher indexSearcher;
/* In-memory directory that buffers documents before they are merged to disk */
protected RAMDirectory ramDirectory;
/* Analyzer (tokenizer) applied to the page content */
protected Analyzer luceneAnalyzer;
/* Character encoding used when parsing pages */
protected String charset;
/* Number of pages indexed so far */
protected int count = 0 ;
/* Port of the base URL */
protected int mPort;
/* Host of the base URL */
protected String mHost;
/* Whether to look up each URL in the existing index first, to avoid re-capturing it */
protected boolean mCheck;
/* Lock guarding write operations on the index */
public static final Object indexLock = new Object();
/**
 * Creates a capturer with no source or target configured, two worker
 * threads, index checking disabled, and a parser whose node factory
 * produces the link-collecting tag classes declared in this class.
 */
public SiteCapturer() {
    this.mSource = null;
    this.mTarget = null;
    this.mthreads = 2;
    this.mCheck = false;
    this.mPages = new ArrayList();
    this.mFinished = new HashSet();

    /* Register the custom tags so link and frame targets are queued while parsing. */
    PrototypicalNodeFactory nodeFactory = new PrototypicalNodeFactory();
    nodeFactory.registerTag(new LocalLinkTag());
    nodeFactory.registerTag(new LocalFrameTag());
    nodeFactory.registerTag(new LocalBaseHrefTag());

    this.mParser = new Parser();
    this.mParser.setNodeFactory(nodeFactory);
}
/**
 * Returns the base URL in its external string form.
 *
 * @return the string form of the configured base URL
 */
public String getSource() {
    final URL base = mSource;
    return base.toExternalForm();
}
/**
 * Sets the base URL of the crawl. A single trailing slash is removed
 * before the string is parsed.
 *
 * If the string is not a valid URL, an error is printed and the
 * previously configured source (if any) is left untouched.
 *
 * @param source the base URL as a string
 */
public void setSource(String source) {
    if (source.endsWith("/")) {
        source = source.substring(0, source.length() - 1);
    }
    try {
        mSource = new URL(source);
    } catch (MalformedURLException e) {
        /*
         * Report the rejected input itself. Calling getSource() here would
         * dereference mSource, which is still null on first use, and would
         * otherwise print the old URL rather than the offending one.
         */
        System.err.println("Invalid URL : " + source);
    }
}
/**
 * Returns the location where the index files are stored.
 *
 * @return the configured index location
 */
public String getTarget() {
    return this.mTarget;
}
/**
 * Sets the location where the index files are stored.
 *
 * @param target the index location
 */
public void setTarget(String target) {
    this.mTarget = target;
}
/**
 * Returns the current value of the worker-thread counter.
 *
 * @return the thread counter
 */
public int getThreads() {
    return this.mthreads;
}
/**
 * Sets the number of worker threads to start.
 *
 * @param threads the number of worker threads
 */
public void setThreads(int threads) {
    this.mthreads = threads;
}
/**
 * Tells whether URLs are looked up in the existing index before capture.
 *
 * @return the value of the check flag
 */
public boolean isMCheck() {
    return this.mCheck;
}
/**
 * Enables or disables the lookup of URLs in the existing index before capture.
 *
 * @param check the new value of the check flag
 */
public void setMCheck(boolean check) {
    this.mCheck = check;
}
/**
 * Entry point of the crawl. Seeds the URL queue with the base URL, opens
 * the on-disk and in-memory IndexWriters, starts the worker threads and
 * waits for them to finish, then merges the in-memory index into the
 * on-disk index and optimizes it.
 *
 * Nothing is crawled unless the base URL answers with HTTP 200 and a
 * content type starting with "text/html".
 */
public void capture() {
    mPages.clear();
    mPages.add(getSource());
    int responseCode = 0;
    String contentType = "";
    try {
        HttpURLConnection uc = (HttpURLConnection) mSource.openConnection();
        responseCode = uc.getResponseCode();
        contentType = uc.getContentType();
    } catch (MalformedURLException mue) {
        System.err.println("Invalid URL : " + getSource());
    } catch (IOException ie) {
        if (ie instanceof UnknownHostException) {
            System.err.println("UnknowHost : " + getSource());
        } else if (ie instanceof SocketException) {
            System.err.println("Socket Error : " + ie.getMessage() + " "
                    + getSource());
        } else {
            ie.printStackTrace();
        }
    }
    /* getContentType() returns null when the server sends no Content-Type header. */
    if (responseCode == HttpURLConnection.HTTP_OK
            && contentType != null
            && contentType.startsWith("text/html")) {
        mPort = mSource.getPort();
        mHost = mSource.getHost();
        charset = autoDetectCharset(mSource);
        /* Location of the index files */
        File indexDir = new File(mTarget);
        /* true means a new index is created, false means documents are appended */
        boolean flag = true;
        if (!indexDir.exists()) {
            /* Create the directory, including missing parents, if it does not exist */
            indexDir.mkdirs();
        } else if (IndexReader.indexExists(mTarget)) {
            /* An index already exists: append to it */
            flag = false;
            File lockfile = new File(mTarget + File.separator + "write.lock");
            if (lockfile.exists()) {
                lockfile.delete();
            }
        }
        luceneAnalyzer = new MMAnalyzer();
        ramDirectory = new RAMDirectory();
        try {
            FSDWriter = new IndexWriter(indexDir, luceneAnalyzer, flag);
            RAMWriter = new IndexWriter(ramDirectory, luceneAnalyzer, true);
            /*
             * Open the searcher once. This must be a plain conditional: as a
             * loop it would never terminate (mCheck does not change here) and
             * would open a new IndexReader on every iteration.
             */
            if (mCheck) {
                IndexReader indexReader = IndexReader.open(mTarget);
                indexSearcher = new IndexSearcher(indexReader);
            }
            long start = System.currentTimeMillis();
            threadList = new ArrayList();
            for (int i = 0; i < mthreads; i++) {
                Thread t = new Thread(this, "K-9 Spider Thread #" + (i + 1));
                t.start();
                threadList.add(t);
            }
            /* Wait for every worker before touching the writers again. */
            while (threadList.size() > 0) {
                Thread child = (Thread) threadList.remove(0);
                try {
                    child.join();
                } catch (InterruptedException e) {
                    e.printStackTrace();
                }
            }
            long elapsed = System.currentTimeMillis() - start;
            /* Merge whatever is still buffered in memory into the on-disk index. */
            RAMWriter.close();
            FSDWriter.addIndexes(new Directory[] { ramDirectory });
            FSDWriter.optimize();
            FSDWriter.close();
            System.out.println("Finished in " + (elapsed / 1000)
                    + " seconds");
            System.out.println("The Count of the Links Captured is "
                    + count);
        } catch (CorruptIndexException cie) {
            cie.printStackTrace();
        } catch (LockObtainFailedException lofe) {
            lofe.printStackTrace();
        } catch (IOException ie) {
            ie.printStackTrace();
        }
    }
}
/**
 * Worker loop: keeps taking URLs from the queue until dequeueURL()
 * returns null, processing each URL that passes isToBeCaptured(), and
 * finally decrements the thread counter.
 */
public void run() {
    for (String next = dequeueURL(); next != null; next = dequeueURL()) {
        if (!isToBeCaptured(next)) {
            continue;
        }
        process(next);
    }
    mthreads--;
}
/**
 * Decides whether a discovered link qualifies for capture. A link
 * qualifies when it uses the http protocol, has the same port and host as
 * the base URL, answers with HTTP 200, and reports a content type
 * starting with text/html or text/plain.
 *
 * @param url the link to test
 * @return true if the link should be captured, false otherwise (including
 *         when the URL is malformed or the request fails)
 */
public boolean isToBeCaptured(String url) {
    HttpURLConnection uc = null;
    int responseCode = 0;
    String contentType = "";
    String host = "";
    int port = 0;
    try {
        URL source = new URL(url);
        String protocol = source.getProtocol();
        if (protocol != null && protocol.equals("http")) {
            host = source.getHost();
            port = source.getPort();
            uc = (HttpURLConnection) source.openConnection();
            uc.setConnectTimeout(8000);
            responseCode = uc.getResponseCode();
            contentType = uc.getContentType();
        }
    } catch (MalformedURLException mue) {
        System.err.println("Invalid URL : " + url);
    } catch (IOException ie) {
        if (ie instanceof UnknownHostException) {
            System.err.println("UnknowHost : " + url);
        } else if (ie instanceof SocketException) {
            System.err.println("Socket Error : " + ie.getMessage() + " "
                    + url);
        } else if (ie instanceof SocketTimeoutException) {
            System.err.println("Socket Connection Time Out : " + url);
        } else if (ie instanceof FileNotFoundException) {
            /*
             * Use the exception's own message. Casting getCause() fails with
             * a NullPointerException whenever no cause is attached.
             */
            System.err.println("broken link " + ie.getMessage()
                    + " ignored");
        } else {
            ie.printStackTrace();
        }
    } finally {
        /* The response body is never read here, so release the connection. */
        if (uc != null) {
            uc.disconnect();
        }
    }
    /* getContentType() returns null when the Content-Type header is absent. */
    return port == mPort
            && responseCode == HttpURLConnection.HTTP_OK
            && host.equals(mHost)
            && contentType != null
            && (contentType.startsWith("text/html")
                    || contentType.startsWith("text/plain"));
}
/**
 * Takes the next URL from the front of the mPages queue and marks it as
 * finished. If the URL qualifies for capture, the page is parsed here so
 * that the registered link and frame tags append newly found links to the
 * queue; links found on a page whose robots meta tag says "none" or
 * "nofollow" are discarded again.
 *
 * When the queue is empty the calling thread waits, unless it is the last
 * active one, in which case all waiting threads are woken and null is
 * returned.
 *
 * @return the next URL to process, or null when no work is left
 */
public synchronized String dequeueURL() {
    while (true) {
        if (mPages.size() > 0) {
            String url = (String) mPages.remove(0);
            mFinished.add(url);
            if (isToBeCaptured(url)) {
                try {
                    /* Queue length before parsing: everything past it comes from this page. */
                    int bookmark = mPages.size();
                    /* Collect all nodes of the page */
                    mParser.setURL(url);
                    NodeList list;
                    try {
                        list = new NodeList();
                        for (NodeIterator e = mParser.elements(); e
                                .hasMoreNodes();) {
                            list.add(e.nextNode());
                        }
                    } catch (EncodingChangeException ece) {
                        /* The encoding changed while parsing: reset and parse again */
                        mParser.reset();
                        list = new NodeList();
                        for (NodeIterator e = mParser.elements(); e
                                .hasMoreNodes();) {
                            list.add(e.nextNode());
                        }
                    }
                    /*
                     * Handle the robots meta tag as described at
                     * http://www.robotstxt.org/wc/meta-user.html
                     */
                    NodeList robots = list.extractAllNodesThatMatch(
                            new AndFilter(new NodeClassFilter(MetaTag.class),
                                    new HasAttributeFilter("name", "robots")),
                            true);
                    if (0 != robots.size()) {
                        MetaTag robot = (MetaTag) robots.elementAt(0);
                        /* A robots tag without a content attribute restricts nothing. */
                        String content = robot.getAttribute("content");
                        if (content != null) {
                            content = content.toLowerCase();
                            if ((-1 != content.indexOf("none"))
                                    || (-1 != content.indexOf("nofollow"))) {
                                /*
                                 * Drop every link this page added. Removing by
                                 * increasing index would skip every other entry
                                 * because the list shifts left on each removal.
                                 */
                                mPages.subList(bookmark, mPages.size()).clear();
                            }
                        }
                    }
                } catch (ParserException pe) {
                    pe.printStackTrace();
                }
            }
            return url;
        } else {
            mthreads--;
            if (mthreads > 0) {
                try {
                    wait();
                    mthreads++;
                } catch (InterruptedException ie) {
                    ie.printStackTrace();
                }
            } else {
                notifyAll();
                return null;
            }
        }
    }
}
/**
 * Processes a single URL: parses the page and adds it to the Lucene index
 * as a document with "content", "url", "title" and "date" fields.
 *
 * When mCheck is enabled, the URL is first looked up in the existing
 * index and skipped if it is already present. Documents are buffered in
 * the in-memory writer and merged into the on-disk index once the buffer
 * exceeds 512 KB.
 *
 * @param url the address of the page to index
 */
protected void process(String url) {
    /* The lookup is comparatively expensive, so it is off by default. */
    if (mCheck) {
        try {
            TermQuery query = new TermQuery(new Term("url", url));
            Hits hits = indexSearcher.search(query);
            if (hits.length() > 0) {
                System.out.println("The URL : " + url
                        + " has already been captured");
                return;
            }
        } catch (IOException ie) {
            /* As before, a failed lookup means the page is not indexed. */
            ie.printStackTrace();
            return;
        }
    }
    String[] result = parseHtml(url, charset);
    /*
     * NOTE(review): parseHtml's body is not fully visible here; its local
     * result starts out as null, so presumably it can return null on
     * failure. Treat that as "nothing to index" rather than crashing.
     */
    if (result == null || result.length < 2) {
        return;
    }
    String content = result[0];
    /* Lucene's Field constructor rejects a null value, so index an empty title. */
    String title = (result[1] == null) ? "" : result[1];
    if (content != null && content.trim().length() > 0) {
        Document document = new Document();
        document.add(new Field("content", content, Field.Store.YES,
                Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.add(new Field("url", url, Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        document.add(new Field("title", title, Field.Store.YES,
                Field.Index.TOKENIZED,
                Field.TermVector.WITH_POSITIONS_OFFSETS));
        document.add(new Field("date", DateTools.timeToString(new Date()
                .getTime(), DateTools.Resolution.DAY), Field.Store.YES,
                Field.Index.UN_TOKENIZED));
        synchronized (indexLock) {
            try {
                RAMWriter.addDocument(document);
                /*
                 * Once the in-memory index grows past the threshold, merge it
                 * into the on-disk index and start a fresh in-memory writer.
                 * Buffering in memory avoids frequent disk I/O.
                 */
                if (RAMWriter.ramSizeInBytes() > 512 * 1024) {
                    RAMWriter.close();
                    FSDWriter.addIndexes(new Directory[] { ramDirectory });
                    RAMWriter = new IndexWriter(ramDirectory,
                            luceneAnalyzer, true);
                }
                count++;
                System.out.println(Thread.currentThread().getName()
                        + ": Finished Indexing URL: " + url);
            } catch (CorruptIndexException cie) {
                cie.printStackTrace();
            } catch (IOException ie) {
                ie.printStackTrace();
            }
        }
    }
}
/**
 * Link tag that normalises its HREF and queues it for crawling.
 * The fragment and a trailing slash are removed; the result is added to
 * mPages unless it is already queued or finished, and written back to the tag.
 */
class LocalLinkTag extends LinkTag {
    public void doSemanticAction() {
        String link = getLink();
        /*
         * Strip the fragment first, then the trailing slash. In the opposite
         * order ".../dir/#top" normalises to ".../dir/" and is queued as a
         * duplicate of ".../dir".
         */
        int pos = link.indexOf("#");
        if (pos != -1) {
            link = link.substring(0, pos);
        }
        if (link.endsWith("/")) {
            link = link.substring(0, link.length() - 1);
        }
        /* Add the link to the processing queue */
        if (!(mFinished.contains(link) || mPages.contains(link))) {
            mPages.add(link);
        }
        setLink(link);
    }
}
/**
 * Frame tag that normalises its SRC URL and queues it for crawling.
 * The fragment and a trailing slash are removed; the result is added to
 * mPages unless it is already queued or finished, and written back to the tag.
 */
class LocalFrameTag extends FrameTag {
    public void doSemanticAction() {
        String link = getFrameLocation();
        /*
         * Strip the fragment first, then the trailing slash. In the opposite
         * order ".../dir/#top" normalises to ".../dir/" and is queued as a
         * duplicate of ".../dir".
         */
        int pos = link.indexOf("#");
        if (pos != -1) {
            link = link.substring(0, pos);
        }
        if (link.endsWith("/")) {
            link = link.substring(0, link.length() - 1);
        }
        /* Add the link to the processing queue */
        if (!(mFinished.contains(link) || mPages.contains(link))) {
            mPages.add(link);
        }
        setFrameLocation(link);
    }
}
/**
 * Base tag that is never rendered: toHtml() always yields an empty string,
 * which effectively switches off the base reference.
 */
class LocalBaseHrefTag extends BaseHrefTag {
    public String toHtml() {
        final String suppressed = "";
        return suppressed;
    }
}
/**
 * Detects the character encoding of the page behind the given URL, so
 * that Chinese text is not garbled. Falls back to the platform default
 * charset when detection fails or yields nothing.
 *
 * @param url the page whose encoding is to be detected
 * @return the name of the detected charset, or of the default charset
 */
protected String autoDetectCharset(URL url) {
    CodepageDetectorProxy proxy = CodepageDetectorProxy.getInstance();
    /*
     * Per the original author's note, ParsingDetector inspects the encoding
     * of HTML/XML files or character streams, and its constructor argument
     * controls whether details of the detection are printed (false: no).
     */
    proxy.add(new ParsingDetector(false));
    proxy.add(JChardetFacade.getInstance());
    proxy.add(ASCIIDetector.getInstance());
    proxy.add(UnicodeDetector.getInstance());

    Charset detected;
    try {
        detected = proxy.detectCodepage(url);
    } catch (MalformedURLException mue) {
        mue.printStackTrace();
        detected = null;
    } catch (IOException ie) {
        ie.printStackTrace();
        detected = null;
    }
    if (detected != null) {
        return detected.name();
    }
    return Charset.defaultCharset().name();
}
/* 按照指定编码解析标准的html页面，为建立索引做准备*/
protected String[] parseHtml(String url, String charset) {
String result[] = null ;
String content = null ;
tr

分享到：

开源网络蜘蛛spider（转载） | Cobra: Java HTML 解析器
- 2010-04-12 15:33
- 浏览 1454
- 评论(0)
- 分类:企业架构
- 查看更多
评论

发表评论

 您还没有登录,请您登录后再发表评论

最近访客更多访客>>

博主相关

文章分类

社区版块

存档分类

最新评论