NutzCN Logo
分享 使用nutz dao为webmagic爬虫写数据库,实现最轻的爬虫
发布于 3412天前 作者 fangoxyz 4525 次浏览 复制 下一个帖子
标签: dao

package com.f8fm.spider;

import org.apache.commons.lang3.builder.ToStringBuilder;
import org.nutz.dao.Dao;
import org.nutz.dao.entity.annotation.Table;
import org.nutz.dao.impl.NutDao;

import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.model.AfterExtractor;
import us.codecraft.webmagic.model.OOSpider;
import us.codecraft.webmagic.model.annotation.ExtractBy;
import us.codecraft.webmagic.model.annotation.HelpUrl;
import us.codecraft.webmagic.model.annotation.TargetUrl;
import us.codecraft.webmagic.pipeline.PageModelPipeline;

import com.alibaba.druid.pool.DruidDataSource;

/**
* @author code4crafter@gmail.com
*/
@TargetUrl("http://www.jokeji.cn/jokehtml/jy/\d+.htm")
@HelpUrl("http://www.jokeji.cn/list\w+.htm")
@Table("sp_news")
public class JokejiModel implements AfterExtractor {

public String toString() {
    return title.toString();
}



public String getTitle() {
    return title;
}



public void setTitle(String title) {
    this.title = title;
}



public String getContent() {
    return content;
}



public void setContent(String content) {
    this.content = content;
}



@ExtractBy("//title/regex('<title>([^_]+)',1)")
private String title;

@ExtractBy("//span[@id=text110]/tidyText()")
private String content;

public static DruidDataSource ds;
public static Dao dao;
public static void main(String[] args) {
    /*driverClassName : "com.mysql.jdbc.Driver",
    url : "jdbc:mysql://192.168.0.253:3306/youfang?useUnicode=true&characterEncoding=utf-8",
    username : "root",
    password : ""*/
    ds=new DruidDataSource();
    ds.setDriverClassName( "com.mysql.jdbc.Driver");
    ds.setUrl( "jdbc:mysql://192.168.0.253:3306/youfang?useUnicode=true&characterEncoding=utf-8");
    ds.setUsername("root");
    ds.setPassword("");
    dao= new NutDao(ds);

    OOSpider.create(Site.me().setDomain("www.jokeji.cn").setCharset("gbk").setSleepTime(100).setTimeOut(3000).setUserAgent("Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)")
            , new PageModelPipeline<Object>() {
                public void process(Object o, Task task) {
                    System.out.println(o);
                }
            }, JokejiModel.class)
            .addUrl("http://www.jokeji.cn/")
            .thread(2)
            .run();
    ds.close();
}


public void afterProcess(Page page) { 
    dao.insert(this); 
    System.out.println(page.getUrl());
}
public class ConsolePageModelPipeline implements PageModelPipeline {
    public void process(Object o, Task task) {
        System.out.println(ToStringBuilder.reflectionToString(o));
    }
}

}

5 回复

使用了 nutz 1.b.53以及webmagic

还有druid数据源

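<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<artifactId>webmagicSpider</artifactId>
<groupId>com.f8fm.spider</groupId>
<version>0.0.1-SNAPSHOT</version>
<dependencies>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-core</artifactId>
        <version>0.5.2</version>
    </dependency>
    <dependency>
        <groupId>us.codecraft</groupId>
        <artifactId>webmagic-extension</artifactId>
        <version>0.5.2</version>
    </dependency>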
 <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.12</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid</artifactId>
        <version>1.0.15</version>
        <exclusions>
            <exclusion>
                <groupId>com.alibaba</groupId>
                <artifactId>jconsole</artifactId>
            </exclusion>
            <exclusion>
                <groupId>com.alibaba</groupId>
                <artifactId>tools</artifactId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.35</version>
    </dependency>
</dependencies>
</project>


当然数据库用的是 MySQL,MySQL 驱动是少不了的

添加回复
请先登陆
回到顶部