    }

    /**
     * Called to start the spider.
     */
    public void begin() {
        cancel = false;
        while (!getWorkloadWaiting().isEmpty() && !cancel) {
            Object list[] = getWorkloadWaiting().toArray();
            // process every waiting URL unless the spider has been cancelled
            for (int i = 0; (i < list.length) && !cancel; i++)
                processURL((URL) list[i]);
        }
    }

    /**
     * A HTML parser callback used by this class to detect links.
     *
     * @author wuhailin
     * @version 1.0
     */
    protected class Parser extends HTMLEditorKit.ParserCallback {
        protected URL base;

        public Parser(URL base) {
            this.base = base;
        }

        public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            String href = (String) a.getAttribute(HTML.Attribute.HREF);

            // frames carry their target in SRC rather than HREF
            if ((href == null) && (t == HTML.Tag.FRAME))
                href = (String) a.getAttribute(HTML.Attribute.SRC);

            if (href == null)
                return;

            // strip any anchor fragment from the link
            int i = href.indexOf('#');
            if (i != -1)
                href = href.substring(0, i);

            // e-mail addresses are reported separately, not crawled
            if (href.toLowerCase().startsWith("mailto:")) {
                report.spiderFoundEMail(href);
                return;
            }

            handleLink(base, href);
        }

        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            handleSimpleTag(t, a, pos); // handle the same way
        }

        protected void handleLink(URL base, String str) {
            try {
                URL url = new URL(base, str);
                if (report.spiderFoundURL(base, url))
                    addURL(url);
            } catch (MalformedURLException e) {
                log("Found malformed URL: " + str);
            }
        }
    }

    /**
     * Called internally to log information.
     * This basic method just writes the log out to stdout.
     *
     * @param entry The information to be written to the log.
     */
    public void log(String entry) {
        System.out.println((new Date()) + ":" + entry);
    }
}

4. HTMLParse.java

import javax.swing.text.html.*;

public class HTMLParse extends HTMLEditorKit {

    // HTMLEditorKit.getParser() is protected; this subclass exists only to expose it
    public HTMLEditorKit.Parser getParser() {
        return super.getParser();
    }
}
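With these pieces in place, the spider's processURL() (called from begin() above, but not shown in this excerpt) opens each page, hands the stream to the parser obtained from HTMLParse, and lets the Parser callback report every link it finds. The listing below is a minimal, self-contained sketch of that wiring, not part of the article's source: the class name ParseDemo and the start URL are made up for illustration, and it simply prints links instead of queueing them through report.spiderFoundURL()/addURL().

import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;

public class ParseDemo {
    public static void main(String[] args) throws Exception {
        // hypothetical start page, used only for this demo
        final URL base = new URL("http://www.example.com/");
        Reader r = new InputStreamReader(base.openStream());

        // HTMLParse exposes the protected getParser() of HTMLEditorKit
        HTMLEditorKit.Parser parser = new HTMLParse().getParser();

        // a throwaway callback: print every HREF instead of adding it to a workload
        parser.parse(r, new HTMLEditorKit.ParserCallback() {
            public void handleSimpleTag(HTML.Tag t, MutableAttributeSet a, int pos) {
                String href = (String) a.getAttribute(HTML.Attribute.HREF);
                if (href == null)
                    return;
                try {
                    // resolve relative links against the page they came from
                    System.out.println(new URL(base, href));
                } catch (MalformedURLException e) {
                    System.out.println("Bad link: " + href);
                }
            }
        }, true);
    }
}

The spider itself follows the same pattern: begin() drains the waiting workload, the per-page fetch and parse happen in processURL(), and the Parser callback above feeds discovered URLs back into the workload, so the loop in begin() keeps running until nothing is left waiting or cancel is set.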
How the Java Web Crawler Works



