import java.util.*;
import java.net.*;
import java.io.*;
import javax.swing.text.*;
import javax.swing.text.html.*;
/**
 * This class implements a reusable spider.
 */
public class Spider {
  /**
   * A collection of URLs that resulted in an error.
   */
  protected Collection workloadError = new ArrayList(3);

  /**
   * A collection of URLs that are waiting to be processed.
   */
  protected Collection workloadWaiting = new ArrayList(3);

  /**
   * A collection of URLs that were processed.
   */
  protected Collection workloadProcessed = new ArrayList(3);

  /**
   * The class that the spider should report its URLs to.
   */
  protected ISpiderReportable report;

  /**
   * A flag that indicates whether this process
   * should be canceled.
   */
  protected boolean cancel = false;
  /**
   * The constructor.
   *
   * @param report A class that implements the ISpiderReportable
   * interface, that will receive information that the
   * spider finds.
   */
  public Spider(ISpiderReportable report) {
    this.report = report;
  }
  /**
   * Get the URLs that resulted in an error.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadError() {
    return workloadError;
  }

  /**
   * Get the URLs that are waiting to be processed.
   * You should add one URL to this collection to
   * begin the spider.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadWaiting() {
    return workloadWaiting;
  }

  /**
   * Get the URLs that were processed by this spider.
   *
   * @return A collection of URLs.
   */
  public Collection getWorkloadProcessed() {
    return workloadProcessed;
  }
  /**
   * Clear all of the workloads.
   */
  public void clear() {
    getWorkloadError().clear();
    getWorkloadWaiting().clear();
    getWorkloadProcessed().clear();
  }
  /**
   * Set a flag that will cause the begin
   * method to return before it is done.
   */
  public void cancel() {
    cancel = true;
  }
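  /**
   * Perform the spidering. The begin method is referenced above but is
   * not part of this excerpt; this is a minimal sketch, assuming it
   * simply drains the waiting workload until it is empty or the cancel
   * flag is set.
   */
  public void begin() {
    cancel = false;
    while (!getWorkloadWaiting().isEmpty() && !cancel) {
      // snapshot the waiting workload so it can be modified while iterating
      Object list[] = getWorkloadWaiting().toArray();
      for (int i = 0; (i < list.length) && !cancel; i++)
        processURL((URL) list[i]);
    }
  }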
  /**
   * Add a URL for processing.
   *
   * @param url The URL to be added.
   */
  public void addURL(URL url) {
    // skip URLs already waiting, already failed, or already processed
    if (getWorkloadWaiting().contains(url))
      return;
    if (getWorkloadError().contains(url))
      return;
    if (getWorkloadProcessed().contains(url))
      return;
    log("Adding to workload: " + url);
    getWorkloadWaiting().add(url);
  }
  /**
   * Called internally to process a URL.
   *
   * @param url The URL to be processed.
   */
  public void processURL(URL url) {
    try {
      log("Processing: " + url);
      // get the URL's contents
      URLConnection connection = url.openConnection();
      if ((connection.getContentType() != null)
          && !connection.getContentType().toLowerCase().startsWith("text/")) {
        // not a text type, so record it as processed but do not parse it
        getWorkloadWaiting().remove(url);
        getWorkloadProcessed().add(url);
        log("Not processing because content type is: "
            + connection.getContentType());
        return;
      }
      // read the URL
      InputStream is = connection.getInputStream();
      Reader r = new InputStreamReader(is);
      // parse the URL
      HTMLEditorKit.Parser parse = new HTMLParse().getParser();
      parse.parse(r, new Parser(url), true);
    } catch (IOException e) {
      getWorkloadWaiting().remove(url);
      getWorkloadError().add(url);
      log("Error: " + url);
      report.spiderURLError(url);
      return;
    }
    // mark URL as complete
    getWorkloadWaiting().remove(url);
    getWorkloadProcessed().add(url);
    log("Complete: " + url);
  }
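  /**
   * Log an entry. The log helper is called throughout the listing but its
   * body is not in this excerpt; a minimal sketch that simply writes a
   * timestamped line to standard output.
   */
  public void log(String entry) {
    System.out.println((new Date()) + ":" + entry);
  }
}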
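The spider reports its findings through the ISpiderReportable interface, which is not defined in the excerpt above. The following is a minimal sketch, inferred from the single report.spiderURLError(url) call in processURL and the constructor's comment that the report class "will receive information that the spider finds"; the spiderFoundURL method and its return-value convention are assumptions:

import java.net.*;

public interface ISpiderReportable {
  // called for each URL the spider finds; return true to follow it (assumed)
  boolean spiderFoundURL(URL base, URL url);

  // called when a URL could not be processed (used in processURL above)
  void spiderURLError(URL url);
}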
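The line HTMLEditorKit.Parser parse = new HTMLParse().getParser(); also depends on a small helper, because HTMLEditorKit declares getParser() as protected. A subclass that widens the access is enough; a sketch of what HTMLParse presumably looks like (the class name comes from the listing, the body is assumed):

import javax.swing.text.html.*;

public class HTMLParse extends HTMLEditorKit {
  // getParser() is protected in HTMLEditorKit; expose it publicly
  public HTMLEditorKit.Parser getParser() {
    return super.getParser();
  }
}

The Parser callback handed to parse.parse(r, new Parser(url), true) is likewise assumed to be an HTMLEditorKit.ParserCallback subclass that resolves each discovered link against the base URL and feeds it back through addURL. Driving the spider is then a matter of seeding the waiting workload and calling begin; a hypothetical usage, where MyReport is some class implementing ISpiderReportable:

public static void main(String args[]) throws Exception {
  Spider spider = new Spider(new MyReport());
  spider.addURL(new URL("http://www.example.com/"));
  spider.begin();
}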