package com.javaweb.spider.processer;

import com.javaweb.spider.MyConfigurableSpider;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import com.javaweb.spider.downloader.HttpClientDownloader;
import com.javaweb.spider.scheduler.CountDownScheduler;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;

import java.util.LinkedHashMap;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;

/**
 * Spider entry point: the default processor that wires a {@link SpiderConfig}
 * into a configured {@link MyConfigurableSpider}, runs it synchronously, and
 * collects the scraped rows.
 *
 * <p>NOTE(review): the class name keeps the original misspelling
 * ("Defalut" instead of "Default") because callers reference it by this name.
 */
public class DefalutProcessor extends AbstractProcessor {

    /**
     * Creates a processor for the given crawl configuration.
     *
     * @param config crawl configuration (entry URLs, field rules, proxy/thread settings)
     */
    public DefalutProcessor(SpiderConfig config) {
        super(config);
    }

    /**
     * Creates a processor for the given crawl configuration with an explicit run id.
     *
     * @param config crawl configuration
     * @param uuid   identifier for this crawl run, forwarded to the spider
     */
    public DefalutProcessor(SpiderConfig config, String uuid) {
        super(config, uuid);
    }

    /**
     * Runs the spider synchronously and returns the collected data rows.
     *
     * <p>Validation failures (invalid config, or no field extraction rules)
     * are logged and cause an early {@code null} return — callers must
     * handle a {@code null} result.
     *
     * @return the rows gathered by {@code processData} during the crawl,
     *         or {@code null} when validation fails
     */
    @Override
    public CopyOnWriteArrayList<LinkedHashMap<String, Object>> execute() {
        // Validate the configuration before doing any network work.
        List<String> errors = this.validateModel(config);
        if (CollectionUtils.isNotEmpty(errors)) {
            logger.warn("校验不通过!请依据下方提示,检查输入参数是否正确......");
            for (String error : errors) {
                logger.warn(">> " + error);
            }
            return null;
        }

        // Field extraction rules are mandatory; without them there is nothing to scrape.
        // NOTE(review): element type assumed to be SpiderField (imported above) — the
        // original generics were lost in extraction; confirm against SpiderConfig.
        List<SpiderField> fields = config.getFieldsList();
        if (CollectionUtils.isEmpty(fields)) {
            logger.warn("校验不通过!爬虫字段对应规则未配置!!!");
            return null;
        }

        // Thread-safe accumulator: the pipeline lambda below may be invoked from
        // multiple spider worker threads.
        CopyOnWriteArrayList<LinkedHashMap<String, Object>> datas = new CopyOnWriteArrayList<>();
        MyConfigurableSpider spider = MyConfigurableSpider.create(this, config, uuid);
        spider.addUrl(config.getEntryUrlsList().toArray(new String[0]))
                .setScheduler(new CountDownScheduler(config))
                .setPipelines(config.getPipelineList())
                // Collect extracted items into `datas` for the return value.
                .addPipeline((resultItems, task) -> this.processData(resultItems, datas, spider));

        spider.setDownloader(new HttpClientDownloader())
                .thread(config.getThreadCount().intValue());

        // Optionally echo scraped items to the console for debugging.
        if (config.getShowLog() == 1) {
            spider.addPipeline(new ConsolePipeline());
        }

        // When proxying is enabled and proxies are configured, replace the default
        // downloader with one routed through the configured proxy pool.
        if (config.getUseProxy() == 1 && CollectionUtils.isNotEmpty(config.getProxyList())) {
            HttpClientDownloader proxyDownloader = new HttpClientDownloader();
            SimpleProxyProvider provider =
                    SimpleProxyProvider.from(config.getProxyList().toArray(new Proxy[0]));
            proxyDownloader.setProxyProvider(provider);
            spider.setDownloader(proxyDownloader);
        }

        // Blocks until the crawl completes (CountDownScheduler bounds the run).
        spider.run();
        return datas;
    }
}