Institute of Geology: Subsidence Monitoring Network Construction Project
zmk
2024-05-15 9e3afc6d0fa514f986d3fea40fa23124e6fb5070
package com.javaweb.spider.processer;
 
import com.javaweb.spider.MyConfigurableSpider;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import com.javaweb.spider.downloader.HttpClientDownloader;
import com.javaweb.spider.scheduler.CountDownScheduler;
import org.apache.commons.collections.CollectionUtils;
import us.codecraft.webmagic.pipeline.ConsolePipeline;
import us.codecraft.webmagic.proxy.Proxy;
import us.codecraft.webmagic.proxy.SimpleProxyProvider;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.concurrent.CopyOnWriteArrayList;
 
/**
 * Spider entry point.
 */
public class DefalutProcessor extends AbstractProcessor {
 
    public DefalutProcessor(SpiderConfig config) {
        super(config);
    }
 
    public DefalutProcessor(SpiderConfig config, String uuid) {
        super(config, uuid);
    }
 
    /**
     * Run the spider and return the collected results.
     *
     * @return the extracted rows, or null if the configuration fails validation
     */
    @Override
    public CopyOnWriteArrayList<LinkedHashMap<String, String>> execute() {
        List<String> errors = this.validateModel(config);
        if (CollectionUtils.isNotEmpty(errors)) {
            logger.warn("Validation failed! Check the input parameters against the hints below...");
            for (String error : errors) {
                logger.warn(">> " + error);
            }
            return null;
        }
        List<SpiderField> fields = config.getFieldsList();
        if (CollectionUtils.isEmpty(fields)) {
            logger.warn("Validation failed! No extraction rule is configured for the spider fields!");
            return null;
        }
 
        CopyOnWriteArrayList<LinkedHashMap<String, String>> results = new CopyOnWriteArrayList<>();
        MyConfigurableSpider spider = MyConfigurableSpider.create(this, config, uuid);

        spider.addUrl(config.getEntryUrlsList().toArray(new String[0]))
                .setScheduler(new CountDownScheduler(config))
                .setPipelines(config.getPipelineList())
                .addPipeline((resultItems, task) -> this.processData(resultItems, results, spider)); // collect each page's extracted fields for the caller
 
            /*if("cms_article".equals(config.getTableName())){
                ArticlePipeline articlePipeline=new ArticlePipeline();
                spider.addPipeline(articlePipeline);
            }else if("cms_book".equals(config.getTableName())){
 
            }else{
            }*/
            spider.setDownloader(new HttpClientDownloader())
                .thread(config.getThreadCount().intValue());
        if(config.getShowLog()==1){
            spider.addPipeline(new ConsolePipeline());
        }
        // Route requests through proxy IPs when enabled; this replaces the plain downloader set above
        if (config.getUseProxy() == 1 && CollectionUtils.isNotEmpty(config.getProxyList())) {
            HttpClientDownloader httpClientDownloader = new HttpClientDownloader();
            SimpleProxyProvider provider = SimpleProxyProvider.from(config.getProxyList().toArray(new Proxy[0]));
            httpClientDownloader.setProxyProvider(provider);
            spider.setDownloader(httpClientDownloader);
        }
 
        // Run synchronously; spider.run() returns once all queued requests have been processed
        spider.run();
        return results;
    }
 
 
}
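
For reference, a minimal sketch of how this processor might be driven. It is illustrative only: loadConfig() below is a hypothetical stand-in for whatever part of this project actually produces a fully populated SpiderConfig (entry URLs, field extraction rules, thread count, proxy settings), since that wiring is not shown in this file.

import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.processer.DefalutProcessor;
import java.util.LinkedHashMap;
import java.util.concurrent.CopyOnWriteArrayList;

public class SpiderRunner {

    public static void main(String[] args) {
        // Hypothetical helper: stands in for this project's real config loading.
        SpiderConfig config = loadConfig();

        DefalutProcessor processor = new DefalutProcessor(config);

        // Blocks until the crawl finishes; returns null when validation fails
        // (the reasons are logged as warnings by execute() itself).
        CopyOnWriteArrayList<LinkedHashMap<String, String>> rows = processor.execute();

        if (rows != null) {
            // One map of field name -> extracted value per scraped page.
            rows.forEach(System.out::println);
        }
    }

    private static SpiderConfig loadConfig() {
        throw new UnsupportedOperationException("Project-specific: build or load a SpiderConfig here.");
    }
}

Note that execute() signals configuration problems by returning null rather than throwing, so callers must null-check before iterating the results.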