
simple-spider: a lean, quick-start web crawler; built on Jsoup, supports cssQuery syntax, supports crawl ...


Project name: simple-spider

Repository: https://gitee.com/mier520/simple-spder

Introduction:

simple-spider

A lean web crawler that is quick to pick up. Built on Jsoup, it supports cssQuery selector syntax, dynamic and static mapping of crawled data onto data beans (Bean objects), IP proxies, and monitoring of crawling and parsing. Complete examples are included: a basic usage example, a novel-crawling example, and an IP-proxy-pool crawling example.

Tech discussion QQ group: 1134935268

[QR codes: scan to join the QQ group and WeChat group]

Real-world project

A real-world use of simple-spider: the crawler core of this project is handled entirely by simple-spider. Project URL: http://www.51fjing.com

The project is brand new; please give it a try: http://www.51fjing.com

Usage

It really is simple: look at the demos to get started, and read the source for secondary development.

Examples

  1. Minimal example - crawl a single page and get the data immediately
    /**
     * Synchronous crawl:
     * blocks until the crawled data is returned.
     */
    private static void spiderNow() throws UnsupportedEncodingException {
        // Target URL
        String url = "http://www.530p.com/s/" + URLEncoder.encode("明天下", "UTF-8");
        // Dynamically assemble the page fields to crawl; see com.qianxun.spider.bean.SpiderBean
        // and com.qianxun.spider.bean.SpiderBeanField for what each setting means.
        SpiderBean spiderBean = SpiderBeanBuilder.newly()
                .fieldBuilder().fieldName("bookName").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter1 > a").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("bookHref").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter1 > a").valueKey("href").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastChapterName").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter2 > a").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastChapterHref").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter2 > a").valueKey("href").buildFieldAndAdd()
                .fieldBuilder().fieldName("author").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter4").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastUpdateTime").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter3").valueKey("text").buildFieldAndAdd()
                .build();
        // Crawl the target data
        Object obj = Spider.get(spiderBean, url);
        System.out.println(obj);
    }
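    The Object returned by Spider.get can be cast to a Map keyed by field name, with collection(true) fields arriving as Lists - at least, that is how the proxy-pool demo in example 5 consumes it; treat the exact shape as inferred rather than documented. A minimal consumption sketch under that assumption:

    // Sketch of consuming the result of spiderNow() above. ASSUMPTION: the
    // dynamically assembled SpiderBean yields a Map<String, Object> whose
    // collection(true) fields are Lists (inferred from example 5's casts).
    // Requires java.util.List and java.util.Map.
    @SuppressWarnings("unchecked")
    private static void printBooks(Object obj) {
        Map<String, Object> result = (Map<String, Object>) obj;
        List<Object> names = (List<Object>) result.get("bookName");
        List<Object> hrefs = (List<Object>) result.get("bookHref");
        for (int i = 0; i < names.size(); i++) {
            System.out.println(names.get(i) + " -> " + hrefs.get(i));
        }
    }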
  2. Asynchronous crawl example
    /**
     * Asynchronous crawl:
     * a custom SpiderMonitor supervises crawl progress,
     * and a custom SpiderPipeline processes the crawled and parsed data.
     */
    private static void syncSpider() {
        // Dynamically assemble the fields to crawl; a little complex, safe to skim at first.
        SpiderBean spiderBean = SpiderBeanBuilder.newly().beanClass(Book.class)
                .fieldBuilder().fieldName("name").selector("body > div:nth-child(3) > div.tna > a").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("author").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(3)").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastChapterUpdateTime").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(7)").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastChapterName").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("lastChapterHref").addTaskUrl().selector("body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a").valueKey("href").buildFieldAndAdd()
                .fieldBuilder().fieldName("description").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(4) > td").valueKey("text").buildFieldAndAdd()
                .fieldBuilder().fieldName("cover").addTaskUrl().selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(1) > img").valueKey("src").buildFieldAndAdd()
                .fieldBuilder().fieldName("chapterList").collection(true).spiderBean(SpiderBeanBuilder.newly().beanClass(Chapter.class)
                        .fieldBuilder().fieldName("chapterName").collection(true).selector("body > div:nth-child(3) > div.conter > div > a").valueKey("text").buildFieldAndAdd()
                        .fieldBuilder().fieldName("chapterHref").collection(true).selector("body > div:nth-child(3) > div.conter > div > a").valueKey("href").buildFieldAndAdd()
                        .build()).buildFieldAndAdd()
                .build();
        // Configure the URL template this SpiderBean parses, crawl/parse monitoring, and the pipeline output:
        // SpiderBeanConfig.urlTemplate: the page URL template the SpiderBean parses; the bean is applied
        //   only when a crawled page URL matches this template (see the field's javadoc for details).
        // ConsoleSpiderMonitor: data monitoring.
        // ConsoleSpiderPipeline: output pipeline for crawled data.
        SpiderBeanConfig spiderBeanConfig = SpiderBeanConfig.newly()
                .spiderBean(spiderBean).urlTemplate("http://www.530p.com/qihuan/shinue-9534/")
                .pipeline(new ConsoleSpiderPipeline()).monitor(new ConsoleSpiderMonitor());
        // Asynchronous crawl: setUrl sets the seed crawl URL.
        Spider.newly().debug(true).config(spiderBeanConfig).setUrl("http://www.530p.com/qihuan/shinue-9534/").syncStart();
    }
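    To persist the parsed Book rather than print it, supply your own SpiderPipeline in place of ConsoleSpiderPipeline. The interface's method names are not shown in this README, so the single process(Object) callback below is purely an assumption; check the simple-spider source for the real signature:

    // HYPOTHETICAL sketch of a custom pipeline. The method name process(Object)
    // is assumed, not taken from the library; verify it against the source.
    public static class SaveBookPipeline implements SpiderPipeline {
        @Override
        public void process(Object data) { // assumed callback signature
            if (data instanceof Book) {
                Book book = (Book) data;
                // hand the book off to your persistence layer here
                System.out.println("saving book: " + book.getName());
            }
        }
    }

    It would then be registered exactly like the console version: .pipeline(new SaveBookPipeline()).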
  3. Static mapping of crawled data onto a Bean - annotation-based
    private static void spiderBeanClassNow() {
        Object obj = Spider.get(SpiderBeanBuilder.newly().buildClass(Book.class), "http://www.530p.com/qihuan/shinue-9534/");
        System.out.println(obj);
    }

    @Data
    @com.qianxun.spider.annote.SpiderBean
    public static class Chapter {
        @SpiderBeanField(collection = true, selector = "body > div:nth-child(3) > div.conter > div > a", valueKey = "text")
        private String chapterName;
        @SpiderBeanField(collection = true, selector = "body > div:nth-child(3) > div.conter > div > a", valueKey = "href")
        private String chapterHref;

        public Chapter() {
        }
    }

    @Data
    @Accessors(chain = true)
    @com.qianxun.spider.annote.SpiderBean
    public static class Book {
        @SpiderBeanField(selector = "body > div:nth-child(3) > div.tna > a", valueKey = "text")
        private String name;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(3)", valueKey = "text")
        private String author;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(1) > img", valueKey = "src")
        private String cover;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(4) > td", valueKey = "text")
        private String description;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a", valueKey = "text")
        private String lastChapterName;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a", valueKey = "href")
        private String lastChapterHref;
        @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(7)", valueKey = "text")
        private String lastChapterUpdateTime;
        @SpiderBeanField(spiderBeanClass = Chapter.class, collection = true)
        private List<Chapter> chapterList;
    }
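    The annotated beans also plug into the asynchronous flow of example 2: build the SpiderBean from the class and drop it into a SpiderBeanConfig. A sketch composed only from calls already shown above:

    // Drive the async-style setup of example 2 with the annotation-built bean.
    private static void spiderBeanClassAsync() {
        SpiderBeanConfig config = SpiderBeanConfig.newly()
                .spiderBean(SpiderBeanBuilder.newly().buildClass(Book.class))
                .urlTemplate("http://www.530p.com/qihuan/shinue-9534/")
                .pipeline(new ConsoleSpiderPipeline())
                .monitor(new ConsoleSpiderMonitor());
        Spider.newly().debug(true).config(config).setUrl("http://www.530p.com/qihuan/shinue-9534/").syncStart();
    }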
  4. Crawling through a proxy - IP proxy pool
    private static void proxyTest() {
        // Proxy pool configuration
        DefaultHttpProxyPool proxyPool = new DefaultHttpProxyPool(3, true, false);
        proxyPool.addLast(new HttpProxy("47.88.7.18", 8088), 15 * 60 * 60 * 1000);
        proxyPool.addLast(new HttpProxy("161.35.124.128", 3128), 15 * 60 * 60 * 1000);
        proxyPool.addLast(new HttpProxy("157.245.221.254", 8080), 15 * 60 * 60 * 1000);
        SpiderBean spiderBean = SpiderBeanBuilder.newly()
                .fieldBuilder().fieldName("ip").selector("title").valueKey("text").buildFieldAndAdd().build();
        SpiderBeanConfig spiderBeanConfig = SpiderBeanConfig.newly().urlTemplate("*")
                .pipeline(new ConsoleSpiderPipeline())
                .monitor(new ConsoleSpiderMonitor()).spiderBean(spiderBean);
        // Crawler-level configuration; see the field javadocs for details.
        SpiderConfig spiderConfig = new SpiderConfig();
        spiderConfig.setConnTimeoutMillis(1000).setHttpProxyPool(proxyPool);
        Spider.newly().setUrl("http://202020.ip138.com/").config(spiderBeanConfig).config(spiderConfig.setDebug(false)).intervalMillis(1000).syncStart();
    }
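    Whether a listed proxy actually works is worth checking before it enters the pool; example 5 below does this with a plain Jsoup request. The same check in isolation, using only standard Jsoup API (the test URL and timeout are arbitrary choices):

    // Probe a proxy with a plain Jsoup request before trusting it
    // (the check example 5 applies). Requires org.jsoup.Jsoup and java.io.IOException.
    private static boolean proxyAlive(String host, int port) {
        final String testUrl = "http://www.baidu.com"; // any stable, fast page works
        try {
            int status = Jsoup.connect(testUrl)
                    .timeout(5000)      // maximum tolerated latency, in ms
                    .proxy(host, port)  // route the request through the proxy
                    .execute()
                    .statusCode();
            return status >= 200 && status < 300; // accept any 2xx response
        } catch (IOException e) {
            return false; // unreachable or too slow
        }
    }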
  5. Practical example (usable directly in projects) - dynamically building and refreshing an IP proxy pool
    @Slf4j
    public class ProxyIpSpiderDemo {

        private DefaultHttpProxyPool pool = new DefaultHttpProxyPool(10, false, false);
        private AtomicBoolean spiderRunStatus = new AtomicBoolean(false);

        public void startSpider() {
            if (isActive()) {
                return;
            }
            this.spiderRunStatus.set(true);
            try {
                SchedulerUtil.schedule(this::innerStartSpider);
            } catch (Throwable e) {
                new Thread(this::innerStartSpider).start();
            }
        }

        public void stopSpider() {
            this.spiderRunStatus.set(false);
        }

        public boolean isActive() {
            return this.spiderRunStatus.get();
        }

        public HttpProxyPool getHttpProxyPool(int retry, boolean localRetry, boolean localPriority) {
            return new HttpProxyPool() {
                @Override
                public HttpProxy next() {
                    if (isEmpty()) {
                        // The proxy pool is empty; notify the administrator.
                        log.info("Proxy IP pool is empty, proxy spider status: " + (isActive() ? "running" : "down"));
                    }
                    return pool.next();
                }

                @Override
                public int retry() {
                    return retry;
                }

                @Override
                public boolean localRetry() {
                    return localRetry;
                }

                @Override
                public boolean localPriority() {
                    return localPriority;
                }

                @Override
                public void makeFailure(HttpProxy httpProxy) {
                    pool.makeFailure(httpProxy);
                }

                @Override
                public boolean isEmpty() {
                    return pool.isEmpty();
                }

                @Override
                public int size() {
                    return pool.size();
                }
            };
        }

        private void innerStartSpider() {
            System.out.println("--- proxy IP spider started, preparing to crawl...");
            // Number of failed crawl attempts
            int failedCount = 0;
            // Failure grace count: once failures exceed 100, notify the administrator
            // to guard against business errors caused by unusable proxy IPs.
            final int failedGrace = 100;
            while (this.spiderRunStatus.get()) {
                try {
                    long sleepTimeMillis;
                    ProxyIpInfoBean proxyIpInfoBean = spiderProxyIp();
                    if (proxyIpInfoBean == null) {
                        sleepTimeMillis = TimeUnit.SECONDS.toMillis(5);
                        if (++failedCount > failedGrace) {
                            failedCount = 0;
                            log.error("Failed to crawl proxy IPs");
                            System.err.println("Failed to crawl proxy IPs");
                        }
                    } else {
                        sleepTimeMillis = sleepTimeMillis(proxyIpInfoBean.getUpdateTimeMillis());
                        // Filter the IPs:
                        // test URL
                        final String testUrl = "http://www.baidu.com";
                        // maximum tolerated latency
                        final int timeout = 5000;
                        // Flag for clearing stale proxies: clears the proxies stored in the
                        // previous round, since most of them are no longer usable.
                        AtomicBoolean clearBeforeFlag = new AtomicBoolean(false);
                        proxyIpInfoBean.getIpList().forEach(proxyIpBean -> {
                            try {
                                int statusCode = Jsoup.connect(testUrl).timeout(timeout).proxy(proxyIpBean.getHost(), proxyIpBean.getPort()).execute().statusCode();
                                if (statusCode >= 200 && statusCode < 300) {
                                    if (!clearBeforeFlag.get()) {
                                        System.err.println("--- older proxy ips cleared");
                                        clearBeforeFlag.set(true);
                                        this.pool.clear();
                                    }
                                    this.pool.addLast(new HttpProxy(proxyIpBean.getHost(), proxyIpBean.getPort()), sleepTimeMillis);
                                    System.err.println("--- spider available proxy ip: " + proxyIpBean.getHost());
                                }
                            } catch (IOException e) {
                                // Request failed; this proxy IP is unusable.
                            }
                        });
                    }
                    log.info("-- proxy IP spider about to sleep for approximately " + sleepTimeMillis + " ms");
                    System.out.println("-- proxy IP spider about to sleep for approximately " + sleepTimeMillis + " ms");
                    // On failure, sleep 5 seconds and crawl again.
                    synchronized (Thread.currentThread()) {
                        Thread.currentThread().wait(sleepTimeMillis);
                    }
                } catch (InterruptedException e) {
                    // Thread interrupted
                    break;
                }
            }
            System.err.println("---- proxy IP spider has stopped");
            this.spiderRunStatus.set(false);
        }

        private long sleepTimeMillis(long updateTimeMillis) {
            // The proxy list refreshes every 15 minutes; this site currently serves as the primary IP source.
            return TimeUnit.MINUTES.toMillis(15) - (System.currentTimeMillis() - updateTimeMillis);
        }

        private ProxyIpInfoBean spiderProxyIp() {
            // The proxy-list site refreshes every 15 minutes.
            SpiderBean spiderBean = SpiderBeanBuilder.newly()
                    .fieldBuilder().fieldName("ipInfoJsonList").collection(true)
                    .selector("table.highlight.tab-size.js-file-line-container > tbody > tr > td:nth-child(2)").valueKey("text").buildFieldAndAdd()
                    .fieldBuilder().fieldName("updateTime").selector("body > div.application-main > div > main > div.container-xl.clearfix.new-discussion-timeline.px-3.px-md-4.px-lg-5 > div > div.Box.d-flex.flex-column.flex-shrink-0.mb-3 > div.Box-header.Box-header--blue.Details.js-details-container > div > div.flex-1.d-flex.flex-items-center.ml-3.min-width-0 > div > span:nth-child(2) > a")
                    .valueKey("text").buildFieldAndAdd().build();
            Object o = Spider.get(spiderBean, "https://github.com/fate0/proxylist/blob/master/proxy.list");
            if (o != null) {
                try {
                    Map<String, Object> objectMap = ((Map<String, Object>) o);
                    List<Object> ipInfoList = (List<Object>) objectMap.get("ipInfoJsonList");
                    List<ProxyIpBean> proxyIpBeans = new ArrayList<>(ipInfoList.size());
                    ipInfoList.forEach(info -> {
                        JSONObject jsonObject = JSON.parseObject(info.toString());
                        ProxyIpBean proxyIpBean = new ProxyIpBean();
                        proxyIpBean.setHost(jsonObject.getString("host")).setPort(jsonObject.getInteger("port"))
                                .setResponseTimeMillis((long) (jsonObject.getDouble("response_time") * 1000));
                        proxyIpBeans.add(proxyIpBean);
                    });
                    Collections.sort(proxyIpBeans, (o1, o2) -> Long.compare(o1.responseTimeMillis, o2.responseTimeMillis));
                    String updateTimeStr = objectMap.get("updateTime").toString();
                    long updateTimeMillis = ObjectUtil.isBlank(updateTimeStr) ? System.currentTimeMillis() : new Date(updateTimeStr).getTime();
                    ProxyIpInfoBean proxyIpInfoBean = new ProxyIpInfoBean();
                    proxyIpInfoBean.setUpdateTimeMillis(updateTimeMillis).setIpList(proxyIpBeans);
                    return proxyIpInfoBean;
                } catch (Throwable throwable) {
                    // Guard against parse errors caused by target-site markup changes,
                    // which would otherwise abort IP crawling.
                    log.error("Error crawling proxy IPs, target: https://github.com/fate0/proxylist/blob/master/proxy.list", throwable);
                }
            }
            return null;
        }

        @Data
        @Accessors(chain = true)
        private static class ProxyIpInfoBean {
            private long updateTimeMillis;
            private List<ProxyIpBean> ipList;
        }

        @Data
        @Accessors(chain = true)
        private static class ProxyIpBean {
            private String host;
            private int port;
            private long responseTimeMillis;
        }
    }
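    Wiring the demo into a crawler is then a matter of handing its pool to SpiderConfig, just as example 4 does with a static pool. A sketch using only calls already shown above; the retry/localRetry/localPriority values are illustrative:

    // Start the proxy-pool spider, then crawl through its dynamically refreshed pool.
    ProxyIpSpiderDemo proxySpider = new ProxyIpSpiderDemo();
    proxySpider.startSpider();

    SpiderConfig spiderConfig = new SpiderConfig();
    // 3 retries per request, no local (direct) retry, no local priority: illustrative values
    spiderConfig.setConnTimeoutMillis(1000)
            .setHttpProxyPool(proxySpider.getHttpProxyPool(3, false, false));

    SpiderBeanConfig beanConfig = SpiderBeanConfig.newly().urlTemplate("*")
            .spiderBean(SpiderBeanBuilder.newly()
                    .fieldBuilder().fieldName("title").selector("title").valueKey("text").buildFieldAndAdd()
                    .build())
            .pipeline(new ConsoleSpiderPipeline())
            .monitor(new ConsoleSpiderMonitor());

    Spider.newly().config(beanConfig).config(spiderConfig).setUrl("http://202020.ip138.com/").syncStart();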
