在线时间:8:00-16:00
迪恩网络APP
随时随地掌握行业动态
扫描二维码
关注迪恩网络微信公众号
开源软件名称:simple-spider
开源软件地址:https://gitee.com/mier520/simple-spder
开源软件介绍:simple-spider 是一个 web 网络爬虫,精简、快速上手;基于 Jsoup 实现,支持 cssQuery 语法,支持爬取数据到数据域(Bean对象)的动态和静态转化,支持 IP 代理,支持数据爬取和解析时监控;完整示例:简单使用示例、小说爬取示例、IP代理池爬取示例;技术交流群:1134935268(扫码进群)
实战项目:simple-spider 在项目中的运用实例,核心爬虫部分完全由 simple-spider 处理,项目地址:http://www.51fjing.com ;项目新成,还望大伙赏脸多用用。使用说明:真的太简单了,上手就看 demo,二次开发就看源码;示例:
/** * 同步爬取 * 等待并获得爬取数据 */ private static void spiderNow() throws UnsupportedEncodingException { //目标地址 String url = "http://www.530p.com/s/" + URLEncoder.encode("明天下" ,"UTF-8"); //动态组成爬取页面数据内容,配置字段含义可参考com.qianxun.spider.bean.SpiderBean 和 com.qianxun.spider.bean.SpiderBeanField SpiderBean spiderBean = SpiderBeanBuilder.newly() .fieldBuilder().fieldName("bookName").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter1 > a").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("bookHref").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter1 > a").valueKey("href").buildFieldAndAdd() .fieldBuilder().fieldName("lastChapterName").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter2 > a").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("lastChapterHref").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter2 > a").valueKey("href").buildFieldAndAdd() .fieldBuilder().fieldName("author").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter4").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("lastUpdateTime").collection(true).selector("body > div:nth-child(3) > div.conter > ul > li.conter3").valueKey("text").buildFieldAndAdd() .build(); //爬取目标数据 Object obj = Spider.get(spiderBean ,url); System.out.println(obj); }
/** * 异步爬取 * 通过自定义的SpiderMonitor监管数据爬取情况 * 通过自定义的SpiderPipeline处理爬取并解析后的数据 */ private static void syncSpider() { //爬取内容动态组装,少复杂可忽略; SpiderBean spiderBean = SpiderBeanBuilder.newly().beanClass(Book.class) .fieldBuilder().fieldName("name").selector("body > div:nth-child(3) > div.tna > a").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("author").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(3)").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("lastChapterUpdateTime").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(7)").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("lastChapterName").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("lastChapterHref").addTaskUrl().selector("body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a").valueKey("href").buildFieldAndAdd() .fieldBuilder().fieldName("description").selector("body > div:nth-child(3) > table > tbody > tr:nth-child(4) > td").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("cover").addTaskUrl().selector("body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(1) > img").valueKey("src").buildFieldAndAdd() .fieldBuilder().fieldName("chapterList").collection(true).spiderBean(SpiderBeanBuilder.newly().beanClass(Chapter.class) .fieldBuilder().fieldName("chapterName").collection(true).selector("body > div:nth-child(3) > div.conter > div > a").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("chapterHref").collection(true).selector("body > div:nth-child(3) > div.conter > div > a").valueKey("href").buildFieldAndAdd() .build()).buildFieldAndAdd() .build(); //配置SpiderBean对应的解析url地址模板、爬取数据监听、解析数据监听、以及爬取数据管道输出 //SpiderBeanConfig.urlTemplate :设定SpiderBean解析的页面模板url地址,即爬取的页面URl地址与该地址匹配时,才运用该spiderBean进行内容解析;详情查看该字段的注释 //ConsoleSpiderMonitor : 数据监听 //ConsoleSpiderPipeline 
: 爬取数据输出管道 SpiderBeanConfig spiderBeanConfig = SpiderBeanConfig.newly() .spiderBean(spiderBean).urlTemplate("http://www.530p.com/qihuan/shinue-9534/") .pipeline(new ConsoleSpiderPipeline()).monitor(new ConsoleSpiderMonitor()); //异步爬取:setUrl 配置起始爬取地址 Spider.newly().debug(true).config(spiderBeanConfig).setUrl("http://www.530p.com/qihuan/shinue-9534/").syncStart(); }
/**
 * Annotation-driven sync crawl: the SpiderBean is derived entirely from the
 * {@code @SpiderBeanField} annotations on {@link Book}, no builder chain needed.
 */
private static void spiderBeanClassNow() {
    Object obj = Spider.get(
            SpiderBeanBuilder.newly().buildClass(Book.class),
            "http://www.530p.com/qihuan/shinue-9534/");
    System.out.println(obj);
}

/** One chapter row of the book's chapter table, mapped by annotations. */
@Data
@com.qianxun.spider.annote.SpiderBean
public static class Chapter {

    @SpiderBeanField(collection = true, selector = "body > div:nth-child(3) > div.conter > div > a", valueKey = "text")
    private String chapterName;

    @SpiderBeanField(collection = true, selector = "body > div:nth-child(3) > div.conter > div > a", valueKey = "href")
    private String chapterHref;

    public Chapter() {
    }
}

/** Book detail page, mapped by annotations; mirrors the builder config in syncSpider(). */
@Data
@Accessors(chain = true)
@com.qianxun.spider.annote.SpiderBean
public static class Book {

    @SpiderBeanField(selector = "body > div:nth-child(3) > div.tna > a", valueKey = "text")
    private String name;

    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(3)", valueKey = "text")
    private String author;

    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(1) > img", valueKey = "src")
    private String cover;

    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(4) > td", valueKey = "text")
    private String description;

    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a", valueKey = "text")
    private String lastChapterName;

    // FIX: was valueKey = "text", which merely duplicated lastChapterName.
    // A link field must read the anchor's href attribute — consistent with
    // Chapter.chapterHref and with the builder version of this field.
    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(2) > td > a", valueKey = "href")
    private String lastChapterHref;

    @SpiderBeanField(selector = "body > div:nth-child(3) > table > tbody > tr:nth-child(1) > td:nth-child(7)", valueKey = "text")
    private String lastChapterUpdateTime;

    @SpiderBeanField(spiderBeanClass = Chapter.class, collection = true)
    private List<Chapter> chapterList;
}
private static void proxyTest(){ //代理池配置 DefaultHttpProxyPool proxyPool = new DefaultHttpProxyPool(3 ,true ,false); proxyPool.addLast(new HttpProxy("47.88.7.18" ,8088) ,15 * 60 * 60 * 1000); proxyPool.addLast(new HttpProxy("161.35.124.128" ,3128) ,15 * 60 * 60 * 1000); proxyPool.addLast(new HttpProxy("157.245.221.254" ,8080) ,15 * 60 * 60 * 1000); SpiderBean spiderBean = SpiderBeanBuilder.newly() .fieldBuilder().fieldName("ip").selector("title").valueKey("text").buildFieldAndAdd().build(); SpiderBeanConfig spiderBeanConfig = SpiderBeanConfig.newly().urlTemplate("*") .pipeline(new ConsoleSpiderPipeline()) .monitor(new ConsoleSpiderMonitor()).spiderBean(spiderBean); //爬虫相关配置,详情参见字段注释 SpiderConfig spiderConfig = new SpiderConfig(); spiderConfig.setConnTimeoutMillis(1000).setHttpProxyPool(proxyPool); Spider.newly().setUrl("http://202020.ip138.com/").config(spiderBeanConfig).config(spiderConfig.setDebug(false)).intervalMillis(1000).syncStart(); }
@Slf4jpublic class ProxyIpSpiderDemo { private DefaultHttpProxyPool pool = new DefaultHttpProxyPool(10 ,false ,false); private AtomicBoolean spiderRunStatus = new AtomicBoolean(false); public void startSpider() { if(isActive()){ return; } this.spiderRunStatus.set(true); try { SchedulerUtil.schedule(this::innerStartSpider); }catch (Throwable e){ new Thread(this::innerStartSpider).start(); } } public void stopSpider() { this.spiderRunStatus.set(false); } public boolean isActive() { return this.spiderRunStatus.get(); } public HttpProxyPool getHttpProxyPool(int retry , boolean localRetry , boolean localPriority) { return new HttpProxyPool() { @Override public HttpProxy next() { if(isEmpty()){ //如果代理池数据为空,通知管理员 log.info("代理IP池数量为空" ,"代理IP池数量为空 ,代理爬虫状态:" + (isActive() ? "运行中" : "挂掉")); } return pool.next(); } @Override public int retry() { return retry; } @Override public boolean localRetry() { return localRetry; } @Override public boolean localPriority() { return localPriority; } @Override public void makeFailure(HttpProxy httpProxy) { pool.makeFailure(httpProxy); } @Override public boolean isEmpty() { return pool.isEmpty(); } @Override public int size() { return pool.size(); } }; } private void innerStartSpider() { System.out.println("---代理IP爬虫启动,准备爬取..."); //数据爬取失败次数 int failedCount = 0; //失败宽限次数,,当达到100次以上时,通知管理员,规避代理ip不可用造成的业务异常 final int failedGrace = 100; while (this.spiderRunStatus.get()){ try { long sleepTimeMillis; ProxyIpInfoBean proxyIpInfoBean = spiderProxyIp(); if (proxyIpInfoBean == null) { sleepTimeMillis = TimeUnit.SECONDS.toMillis(5); if(++failedCount > failedGrace){ failedCount = 0; log.error("代理IP爬取失败"); System.err.println("代理IP爬取失败"); } }else{ sleepTimeMillis = sleepTimeMillis(proxyIpInfoBean.getUpdateTimeMillis()); //IP过滤 //测试url地址 final String testUrl = "http://www.baidu.com"; //最大延迟容忍时间 final int timeout = 5000; //废弃代理IP清除标志,用于清除上一次存入的代理IP,因为其多数已不可用 AtomicBoolean clearBeforeFlag = new AtomicBoolean(false); 
proxyIpInfoBean.getIpList().forEach(proxyIpBean -> { try { int statusCode = Jsoup.connect(testUrl).timeout(timeout).proxy(proxyIpBean.getHost() ,proxyIpBean.getPort()).execute().statusCode(); if(statusCode >= 200 && statusCode < 300){ if(!clearBeforeFlag.get()){ System.err.println("--- older proxy ip was clear"); clearBeforeFlag.set(true); this.pool.clear(); } this.pool.addLast(new HttpProxy(proxyIpBean.getHost() ,proxyIpBean.getPort()) ,sleepTimeMillis); System.err.println("--- spider available proxy ip: " + proxyIpBean.getHost()); } } catch (IOException e) { //请求异常,该代理IP不可用 } }); } log.info("--代理IP爬虫即将进入休眠时间,预计休眠:" + sleepTimeMillis + "毫秒"); System.out.println("--代理IP爬虫即将进入休眠时间,预计休眠:" + sleepTimeMillis + "毫秒"); //爬取失败,休眠5秒重新爬取 synchronized (Thread.currentThread()){ Thread.currentThread().wait(sleepTimeMillis); } } catch (InterruptedException e){ //线程中断异常 break; } } System.err.println("---- 代理IP爬虫已退出服务"); this.spiderRunStatus.set(false); } private long sleepTimeMillis(long updateTimeMillis){ //proxy list 15分钟更新一次,目前以该网站ip作为主要ip进行更新 return TimeUnit.MINUTES.toMillis(15) - (System.currentTimeMillis() - updateTimeMillis); } private ProxyIpInfoBean spiderProxyIp(){ // proxy list 网站 15分钟更新一次 SpiderBean spiderBean = SpiderBeanBuilder.newly() .fieldBuilder().fieldName("ipInfoJsonList").collection(true) .selector("table.highlight.tab-size.js-file-line-container > tbody > tr > td:nth-child(2)").valueKey("text").buildFieldAndAdd() .fieldBuilder().fieldName("updateTime").selector("body > div.application-main > div > main > div.container-xl.clearfix.new-discussion-timeline.px-3.px-md-4.px-lg-5 > div > div.Box.d-flex.flex-column.flex-shrink-0.mb-3 > div.Box-header.Box-header--blue.Details.js-details-container > div > div.flex-1.d-flex.flex-items-center.ml-3.min-width-0 > div > span:nth-child(2) > a") .valueKey("text").buildFieldAndAdd().build(); Object o = Spider.get(spiderBean,"https://github.com/fate0/proxylist/blob/master/proxy.list"); if(o != null){ try { Map<String, 
Object> objectMap = ((Map<String, Object>) o); List<Object> ipInfoList = (List<Object>) objectMap.get("ipInfoJsonList"); List<ProxyIpBean> proxyIpBeans = new ArrayList<>(ipInfoList.size()); ipInfoList.forEach(info -> { JSONObject jsonObject = JSON.parseObject(info.toString()); ProxyIpBean proxyIpBean = new ProxyIpBean(); proxyIpBean.setHost(jsonObject.getString("host")).setPort(jsonObject.getInteger("port")) .setResponseTimeMillis((long) (jsonObject.getDouble("response_time") * 1000)); proxyIpBeans.add(proxyIpBean); }); Collections.sort(proxyIpBeans, (o1, o2) -> o1.responseTimeMillis > o2.responseTimeMillis ? 1 : (o1.responseTimeMillis == o2.responseTimeMillis ? 0 : -1)); String updateTimeStr = objectMap.get("updateTime").toString(); long updateTimeMillis = ObjectUtil.isBlank(updateTimeStr) ? System.currentTimeMillis() : new Date(updateTimeStr).getTime(); ProxyIpInfoBean proxyIpInfoBean = new ProxyIpInfoBean(); proxyIpInfoBean.setUpdateTimeMillis(updateTimeMillis).setIpList(proxyIpBeans); return proxyIpInfoBean; }catch (Throwable throwable){ //防止目标网站数据变更导致数据解析错误,从而导致退出ip爬取 log.error("爬出代理IP出现异常,目标地址:https://github.com/fate0/proxylist/blob/master/proxy.list",throwable); } } return null; } @Data @Accessors(chain = true) private static class ProxyIpInfoBean { private long updateTimeMillis; private List<ProxyIpBean> ipList; } @Data @Accessors(chain = true) private static class ProxyIpBean { private String host; private int port; private long responseTimeMillis; }} |
请发表评论