import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * This class implements a reusable spider.
 */
public class Spider {
  /**
   * A collection of URLs that resulted in an error.
   */
  protected Collection workloadError = new ArrayList(3);

  /**
   * A collection of URLs that are waiting to be processed.
   */
  protected Collection workloadWaiting = new ArrayList(3);

  /**
   * A collection of URLs that were processed.
   */
  protected Collection workloadProcessed = new ArrayList(3);

  /**
   * The class that the spider should report its URLs to.
   */
  protected ISpiderReportable report;

  /**
   * A flag that indicates whether this process
   * should be canceled.
   */
  protected boolean cancel = false;
  /**
   * The constructor.
   *
   * @param report A class that implements the ISpiderReportable
   * interface, that will receive information that the
   * spider finds.
   */
  public Spider(ISpiderReportable report) {
    this.report = report;
  }
  /**
   * Get the URLs that resulted in an error.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadError() {
    return workloadError;
  }

  /**
   * Get the URLs that are waiting to be processed.
   * You should add one URL to this collection to
   * begin the spider.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadWaiting() {
    return workloadWaiting;
  }

  /**
   * Get the URLs that were processed by this spider.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadProcessed() {
    return workloadProcessed;
  }
  /**
   * Clear all of the workloads.
   */
  public void clear() {
    getWorkloadError().clear();
    getWorkloadWaiting().clear();
    getWorkloadProcessed().clear();
  }
  /**
   * Set a flag that will cause the begin
   * method to return before it is done.
   */
  public void cancel() {
    cancel = true;
  }
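  /**
   * Perform the spidering. The begin method is referenced above but is
   * not part of this excerpt; this is a minimal sketch, assuming it
   * simply drains the waiting workload until it is empty or the cancel
   * flag is set.
   */
  public void begin() {
    cancel = false;
    while (!getWorkloadWaiting().isEmpty() && !cancel) {
      // snapshot the waiting workload so it can be modified while iterating
      Object list[] = getWorkloadWaiting().toArray();
      for (int i = 0; (i < list.length) && !cancel; i++)
        processURL((URL) list[i]);
    }
  }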
  /**
   * Add a URL for processing.
   *
   * @param url The URL to be added.
   */
  public void addURL(URL url) {
    // skip URLs already waiting, already failed, or already processed
    if (getWorkloadWaiting().contains(url))
      return;
    if (getWorkloadError().contains(url))
      return;
    if (getWorkloadProcessed().contains(url))
      return;
    log("Adding to workload: " + url);
    getWorkloadWaiting().add(url);
  }
  /**
   * Called internally to process a URL.
   *
   * @param url The URL to be processed.
   */
  public void processURL(URL url) {
    try {
      log("Processing: " + url);
      // get the URL's contents
      URLConnection connection = url.openConnection();
      if ((connection.getContentType() != null)
          && !connection.getContentType().toLowerCase().startsWith("text/")) {
        // not a text type, so record it as processed but do not parse it
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Not processing because content type is: "
            + connection.getContentType());
        return;
      }
      // read the URL
      InputStream is = connection.getInputStream();
      Reader r = new InputStreamReader(is);
      // parse the URL
      HTMLEditorKit.Parser parse = new HTMLParse().getParser();
      parse.parse(r, new Parser(url), true);
    } catch (IOException e) {
      getWorkloadWaiting().remove(url);
      getWorkloadError().add(url);
      log("Error: " + url);
      report.spiderURLError(url);
      return;
    }
    // mark URL as complete
    getWorkloadWaiting().remove(url);
    getWorkloadProcessed().add(url);
    log("Complete: " + url);
  }
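  /**
   * Log an entry. The log helper is called throughout the listing but its
   * body is not in this excerpt; a minimal sketch that simply writes a
   * timestamped line to standard output.
   */
  public void log(String entry) {
    System.out.println((new Date()) + ":" + entry);
  }
}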
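The spider reports its findings through the ISpiderReportable interface, which is not defined in the excerpt above. The following is a minimal sketch, inferred from the single report.spiderURLError(url) call in processURL and the constructor's comment that the report class "will receive information that the spider finds"; the spiderFoundURL method and its return-value convention are assumptions:

import java.net.*;

public interface ISpiderReportable {
  // called for each URL the spider finds; return true to follow it (assumed)
  boolean spiderFoundURL(URL base, URL url);

  // called when a URL could not be processed (used in processURL above)
  void spiderURLError(URL url);
}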
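The line HTMLEditorKit.Parser parse = new HTMLParse().getParser(); also depends on a small helper, because HTMLEditorKit declares getParser() as protected. A subclass that widens the access is enough; a sketch of what HTMLParse presumably looks like (the class name comes from the listing, the body is assumed):

import javax.swing.text.html.*;

public class HTMLParse extends HTMLEditorKit {
  // getParser() is protected in HTMLEditorKit; expose it publicly
  public HTMLEditorKit.Parser getParser() {
    return super.getParser();
  }
}

The Parser callback handed to parse.parse(r, new Parser(url), true) is likewise assumed to be an HTMLEditorKit.ParserCallback subclass that resolves each discovered link against the base URL and feeds it back through addURL. Driving the spider is then a matter of seeding the waiting workload and calling begin; a hypothetical usage, where MyReport is some class implementing ISpiderReportable:

public static void main(String args[]) throws Exception {
  Spider spider = new Spider(new MyReport());
  spider.addURL(new URL("http://www.example.com/"));
  spider.begin();
}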