    }

    /**
     * Called to start the spider.
     */
    public void begin() {
        cancel = false;
        while (!getWorkloadWaiting().isEmpty() && !cancel) {
            Object list[] = getWorkloadWaiting().toArray();
            // process every waiting URL unless the spider has been cancelled
            for (int i = 0; (i < list.length) && !cancel; i++)
                processURL((URL) list[i]);
        }
    }

    /**
     * A HTML parser callback used by this class to detect links.
     *
     * @author wuhailin
     * @version 1.0
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {
        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);

            // frames carry their target in SRC rather than HREF
            if ((href == null) && (t == HTML.Tag.FRAME))
                href = (String) a.getAttribute(HTML.Attribute.SRC);

            if (href == null)
                return;

            // strip any anchor fragment from the link
            int i = href.indexOf('#');
            if (i != -1)
                href = href.substring(0, i);

            // e-mail addresses are reported separately, not crawled
            if (href.toLowerCase().startsWith("mailto:")) {
                report.spiderFoundEMail(href);
                return;
            }

            handleLink(base, href);
        }

        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos); // handle the same way
        }

        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                if (report.spiderFoundURL(base, url))
                    addURL(url);
            } catch (MalformedURLException e) {
                log("Found malformed URL: " + str);
            }
        }
    }

    /**
     * Called internally to log information.
     * This basic method just writes the log out to stdout.
     *
     * @param entry The information to be written to the log.
     */
    public void log(String entry) {
        System.out.println((new Date()) + ":" + entry);
    }
}

4. HTMLParse.java

import javax.swing.text.html.*;

public class HTMLParse extends HTMLEditorKit {

    // HTMLEditorKit.getParser() is protected; this subclass exists only to expose it
    public HTMLEditorKit.Parser getParser() {
        return super.getParser();
    }
}
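With these pieces in place, the spider's processURL() (called from begin() above, but not shown in this excerpt) opens each page, hands the stream to the parser obtained from HTMLParse, and lets the Parser callback report every link it finds. The listing below is a minimal, self-contained sketch of that wiring, not part of the article's source: the class name ParseDemo and the start URL are made up for illustration, and it simply prints links instead of queueing them through report.spiderFoundURL()/addURL().

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

public class ParseDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical start page, used only for this demo
        final URL base = new URL("http://www.example.com/");
        Reader r = new InputStreamReader(base.openStream());

        // HTMLParse exposes the protected getParser() of HTMLEditorKit
        HTMLEditorKit.Parser parser = new HTMLParse().getParser();

        // a throwaway callback: print every HREF instead of adding it to a workload
        parser.parse(r, new HTMLEditorKit.ParserCallback() {
            public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                String href = (String) a.getAttribute(HTML.Attribute.HREF);
                if (href == null)
                    return;
                try {
                    // resolve relative links against the page they came from
                    System.out.println(new URL(base, href));
                } catch (MalformedURLException e) {
                    System.out.println("Bad link: " + href);
                }
            }
        }, true);
    }
}

The spider itself follows the same pattern: begin() drains the waiting workload, the per-page fetch and parse happen in processURL(), and the Parser callback above feeds discovered URLs back into the workload, so the loop in begin() keeps running until nothing is left waiting or cancel is set.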
How the Java Web Crawler Works



