好文档 - 专业文书写作范文服务资料分享网站

网络爬虫Java实现原理

天下 分享 时间: 加入收藏 我要投稿 点赞

begin_actionPerformed(event); } }

/**

* Called when the begin or cancel buttons are clicked *

* @param event The event associated with the button. */

void begin_actionPerformed(java.awt.event.ActionEvent event) {
    // The same button starts a crawl when none is running and cancels the
    // running one otherwise; backgroundThread == null means "idle".
    if (backgroundThread == null) {
        // NOTE(review): the caption literal was lost when this listing was
        // extracted; "Cancel" is reconstructed from the method's description
        // ("begin or cancel buttons") - confirm against the original source.
        // setText is used because run() restores the caption through setText.
        begin.setText("Cancel");

        // Reset the counters BEFORE the worker starts, so a link reported
        // immediately by the new thread is not wiped out by a late reset.
        goodLinksCount = 0;
        badLinksCount = 0;

        backgroundThread = new Thread(this);
        backgroundThread.start();
    } else if (spider != null) {
        // spider is only assigned inside run(); a click that arrives before
        // the worker has created it would otherwise dereference null.
        spider.cancel();
    }
}

/**

* Perform the background thread operation. This method * actually starts the background thread. */

public void run() { try {

errors.setText(\

spider = new Spider(this); spider.clear();

base = new URL(url.getText()); spider.addURL(base); spider.begin();

Runnable doLater = new Runnable() {

public void run() {

begin.setText(\ } };

SwingUtilities.invokeLater(doLater); backgroundThread=null;

} catch ( MalformedURLException e ) { UpdateErrors err = new UpdateErrors(); err.msg = \address.\ SwingUtilities.invokeLater(err);

} }

/**

* Called by the spider when a URL is found. It is here * that links are validated. *

* @param base The page that the link was found on. * @param url The actual link address. */

public boolean spiderFoundURL(URL base,URL url)

{

UpdateCurrentStats cs = new UpdateCurrentStats(); cs.msg = url.toString(); SwingUtilities.invokeLater(cs);

if ( !checkLink(url) ) {

UpdateErrors err = new UpdateErrors(); err.msg = url+\page \+ base + \ SwingUtilities.invokeLater(err); badLinksCount++; return false; }

goodLinksCount++;

if ( !url.getHost().equalsIgnoreCase(base.getHost()) ) return false; else

return true; }

/**

* Called when a URL error is found *

* @param url The URL that resulted in an error. */

public void spiderURLError(URL url) {
    // Intentionally empty: this callback takes no action on URL errors.
}

/**

* Called internally to check whether a link is good *

* @param url The link that is being checked.

* @return True if the link was good, false otherwise. */

protected boolean checkLink(URL url) {
    // A link is "good" when a connection can be opened and, for HTTP(S),
    // the server answers with a non-error status.
    try {
        URLConnection connection = url.openConnection();

        if (connection instanceof java.net.HttpURLConnection) {
            java.net.HttpURLConnection http = (java.net.HttpURLConnection) connection;
            try {
                // connect() alone succeeds even when the server replies
                // 404 or 500, which made dead pages count as good links.
                // getResponseCode() returns -1 for a non-HTTP reply, so
                // require a real success/redirect status.
                int status = http.getResponseCode();
                return status >= java.net.HttpURLConnection.HTTP_OK
                        && status < java.net.HttpURLConnection.HTTP_BAD_REQUEST;
            } finally {
                // Release the connection; it was previously left open.
                http.disconnect();
            }
        }

        // Other protocols: being able to connect is the only check made.
        connection.connect();
        return true;
    } catch (IOException e) {
        // Any I/O failure while connecting marks the link as bad.
        return false;
    }
}

/**

* Called when the spider finds an e-mail address *

* @param email The email address the spider found. */

public void spiderFoundEMail(String email) {
    // Intentionally empty: e-mail addresses reported here are not used.
}

/**

* Internal class used to update the error information * in a Thread-Safe way */

class UpdateErrors implements Runnable {

    /** Text to add to the error display; assigned by the caller before scheduling. */
    public String msg;

    /** Appends {@link #msg} to the {@code errors} component. */
    public void run() {
        errors.append(msg);
    }
}

/**

* Used to update the current status information * in a "Thread-Safe" way */

class UpdateCurrentStats implements Runnable { public String msg; public void run() {

current.setText(\Processing: \+ msg );

goodLinksLabel.setText(\Links: \+ goodLinksCount); badLinksLabel.setText(\Links: \+ badLinksCount); } } }

2. ISpiderReportable.java import java.net.*;

/**
 * Callbacks through which a spider reports what it encounters.
 */
interface ISpiderReportable {

    /**
     * Called by the spider when a URL is found.
     *
     * @param base the page that the link was found on
     * @param url  the actual link address
     * @return a flag chosen by the implementation; how the spider uses it
     *         is not defined here
     */
    boolean spiderFoundURL(URL base, URL url);

    /**
     * Called when a URL error is found.
     *
     * @param url the URL that resulted in an error
     */
    void spiderURLError(URL url);

    /**
     * Called when the spider finds an e-mail address.
     *
     * @param email the e-mail address the spider found
     */
    void spiderFoundEMail(String email);
}

3. Spider.java

网络爬虫Java实现原理

begin_actionPerformed(event); } } /** * Called when the begin or cancel buttons are clicked * * @param event The event associated with the button.
推荐度:
点击下载文档文档为doc格式
0463u8hrgx9da6a52izb
领取福利

微信扫码领取福利

微信扫码分享