2010-11-09

Hi, I want to create a web crawler in Java that retrieves certain data, such as the title and description of a web page, and stores that data in a database. How do I create a web crawler in Java?


I like HtmlUnit, but I'm not sure how that would work on Android... – MatrixFrog


Tell me how to use HtmlUnit to create a web crawler. First, I want to parse some data and store it in a DB. –
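
A minimal HtmlUnit sketch on the desktop JVM (as the comment above notes, HtmlUnit is not readily usable on Android) that fetches a page's title and meta description; the database step is left out, and the option/cleanup methods vary a little between HtmlUnit versions:

import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlMeta;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitExample {
    public static void main(String[] args) throws Exception {
        WebClient webClient = new WebClient();
        try {
            // Plain HTML is enough here; on old HtmlUnit versions this is
            // webClient.setJavaScriptEnabled(false) instead.
            webClient.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = webClient.getPage("http://www.example.com/");

            // <title> of the page
            String title = page.getTitleText();

            // <meta name="description" content="..."> if the page has one
            HtmlMeta meta = page.getFirstByXPath("//meta[@name='description']");
            String description = (meta == null) ? "" : meta.getContentAttribute();

            System.out.println(title);
            System.out.println(description);
        } finally {
            webClient.close(); // closeAllWindows() on older HtmlUnit versions
        }
    }
}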

Answers


If you want to roll your own, use the HttpClient included in the Android API.

Example of using HttpClient (you only have to parse the response for the data you want; note that the HTML parsing below uses javax.swing classes, which are not available on Android):

import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;

public class HttpTest {
    public static void main(String... args)
            throws ClientProtocolException, IOException {
        crawlPage("http://www.google.com/");
    }

    /** URLs already visited, so each page is crawled only once. */
    static Set<String> checked = new HashSet<String>();

    private static void crawlPage(String url) throws ClientProtocolException, IOException {
        if (checked.contains(url))
            return;

        checked.add(url);

        System.out.println("Crawling: " + url);

        HttpClient client = new DefaultHttpClient();
        HttpGet request = new HttpGet(url); // was hard-coded to google.com, which re-crawled the same page
        HttpResponse response = client.execute(request);

        Reader reader = null;
        try {
            reader = new InputStreamReader(response.getEntity().getContent());

            // Collect every link on the page, then crawl them recursively.
            Links links = new Links();
            new ParserDelegator().parse(reader, links, true);

            for (String link : links.list)
                if (link.startsWith("http://"))
                    crawlPage(link);

        } finally {
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /** Parser callback that collects the href attribute of every <a> tag. */
    static class Links extends HTMLEditorKit.ParserCallback {

        List<String> list = new LinkedList<String>();

        @Override
        public void handleStartTag(HTML.Tag t, MutableAttributeSet a, int pos) {
            if (t == HTML.Tag.A) {
                Object href = a.getAttribute(HTML.Attribute.HREF);
                if (href != null) // <a> tags without an href would otherwise cause an NPE
                    list.add(href.toString());
            }
        }
    }
}
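
The question also asks how to store the extracted title and description in a database, which the code above does not cover. A minimal sketch using jsoup for the extraction and plain JDBC for the insert; the pages table, the MySQL connection URL, and the credentials are assumptions:

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.PreparedStatement;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

public class PageStore {
    public static void main(String[] args) throws Exception {
        String url = "http://www.example.com/";

        // Extract title and meta description with jsoup
        Document doc = Jsoup.connect(url).get();
        String title = doc.title();
        Element meta = doc.select("meta[name=description]").first();
        String description = (meta == null) ? "" : meta.attr("content");

        // Hypothetical table: pages(url VARCHAR, title VARCHAR, description TEXT);
        // requires the MySQL JDBC driver on the classpath.
        Connection con = DriverManager.getConnection(
                "jdbc:mysql://localhost/crawler", "user", "password");
        try {
            PreparedStatement ps = con.prepareStatement(
                    "INSERT INTO pages (url, title, description) VALUES (?, ?, ?)");
            ps.setString(1, url);
            ps.setString(2, title);
            ps.setString(3, description);
            ps.executeUpdate();
            ps.close();
        } finally {
            con.close();
        }
    }
}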

You can use crawler4j. Crawler4j is an open-source Java crawler that provides a simple interface for crawling the web. You can set up a multi-threaded web crawler in a few hours.
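
A short sketch of what a crawler4j crawler looks like (method signatures and package names follow a recent crawler4j release and differ slightly between versions; the seed URL and storage folder are placeholders):

import edu.uci.ics.crawler4j.crawler.CrawlConfig;
import edu.uci.ics.crawler4j.crawler.CrawlController;
import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.crawler.WebCrawler;
import edu.uci.ics.crawler4j.fetcher.PageFetcher;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig;
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer;
import edu.uci.ics.crawler4j.url.WebURL;

public class MyCrawler extends WebCrawler {

    @Override
    public boolean shouldVisit(Page referringPage, WebURL url) {
        // Stay on one site; skip everything else
        return url.getURL().startsWith("http://www.example.com/");
    }

    @Override
    public void visit(Page page) {
        if (page.getParseData() instanceof HtmlParseData) {
            HtmlParseData html = (HtmlParseData) page.getParseData();
            System.out.println(page.getWebURL().getURL() + " -> " + html.getTitle());
        }
    }

    public static void main(String[] args) throws Exception {
        CrawlConfig config = new CrawlConfig();
        config.setCrawlStorageFolder("/tmp/crawl"); // intermediate crawl data
        PageFetcher fetcher = new PageFetcher(config);
        RobotstxtServer robots = new RobotstxtServer(new RobotstxtConfig(), fetcher);
        CrawlController controller = new CrawlController(config, fetcher, robots);
        controller.addSeed("http://www.example.com/");
        controller.start(MyCrawler.class, 5); // 5 crawler threads
    }
}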

You can use WebCollector: https://github.com/CrawlScript/WebCollector

Demo based on WebCollector 2.05:

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler; 
import cn.edu.hfut.dmic.webcollector.model.Links; 
import cn.edu.hfut.dmic.webcollector.model.Page; 
import java.util.regex.Pattern; 
import org.jsoup.nodes.Document; 

/** 
* Crawl news from yahoo news 
* 
* @author hu 
*/ 
public class YahooCrawler extends BreadthCrawler { 

    /** 
    * @param crawlPath crawlPath is the path of the directory which maintains 
    * information of this crawler 
    * @param autoParse if autoParse is true, BreadthCrawler will automatically 
    * extract links that match the regex rules from the page 
    */ 
    public YahooCrawler(String crawlPath, boolean autoParse) { 
     super(crawlPath, autoParse); 
     /*start page*/ 
     this.addSeed("http://news.yahoo.com/"); 

     /*fetch url like http://news.yahoo.com/xxxxx*/ 
     this.addRegex("http://news.yahoo.com/.*"); 
     /*do not fetch urls like http://news.yahoo.com/xxxx/xxx */ 
     this.addRegex("-http://news.yahoo.com/.+/.*"); 
     /*do not fetch jpg|png|gif*/ 
     this.addRegex("-.*\\.(jpg|png|gif).*"); 
     /*do not fetch url contains #*/ 
     this.addRegex("-.*#.*"); 
    } 

    @Override 
    public void visit(Page page, Links nextLinks) { 
     String url = page.getUrl(); 
     /*if page is news page*/ 
     if (Pattern.matches("http://news.yahoo.com/.+html", url)) { 
      /*we use jsoup to parse page*/ 
      Document doc = page.getDoc(); 

      /*extract title and content of news by css selector*/ 
      String title = doc.select("h1[class=headline]").first().text(); 
      String content = doc.select("div[class=body yom-art-content clearfix]").first().text(); 

      System.out.println("URL:\n" + url); 
      System.out.println("title:\n" + title); 
      System.out.println("content:\n" + content); 

      /*If you want to add urls to crawl,add them to nextLink*/ 
      /*WebCollector automatically filters links that have been fetched before*/ 
      /*If autoParse is true and a link added to nextLinks does not match the regex rules, it will also be filtered.*/ 
      // nextLinks.add("http://xxxxxx.com"); 
     } 
    } 

    public static void main(String[] args) throws Exception { 
     YahooCrawler crawler = new YahooCrawler("crawl", true); 
     crawler.setThreads(50); 
     crawler.setTopN(100); 
     //crawler.setResumable(true); 
     /*start crawl with depth of 4*/ 
     crawler.start(4); 
    } 

} 