地质所 沉降监测网建设项目
zmk
2024-05-15 9e3afc6d0fa514f986d3fea40fa23124e6fb5070
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
package com.javaweb.spider.backend;
 
import com.javaweb.common.core.domain.AjaxResult;
import com.javaweb.common.core.domain.ICallBack;
import com.javaweb.common.utils.spring.SpringUtils;
import com.javaweb.spider.config.SpiderConstants;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import com.javaweb.spider.domain.SpiderFiledRule;
import com.javaweb.spider.domain.SpiderMission;
import com.javaweb.spider.mapper.SpiderFieldMapper;
import com.javaweb.spider.mapper.SpiderFiledRuleMapper;
import com.javaweb.spider.processer.AbstractProcessor;
import com.javaweb.spider.processer.DefalutProcessor;
import com.javaweb.spider.service.ISpiderConfigService;
import com.javaweb.spider.service.ISpiderMissionService;
import org.apache.commons.lang3.StringUtils;
import org.assertj.core.util.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;

import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.regex.Pattern;
 
/**
 * 任务爬虫入口类,通过传入爬虫任务id,从数据库中查询配置参数
 * 该种方式的爬虫相比快速配置的FastSpiderBackendService来比,它可以设置字段值处理规则。
 */
public class SpiderBackendService extends Thread {
    protected final Logger logger = LoggerFactory.getLogger(SpiderBackendService.class);
    private String missionId;
    private ICallBack callBack;
 
 
    private ISpiderMissionService spiderMissionService= SpringUtils.getBean(ISpiderMissionService.class);//爬虫任务
 
    private ISpiderConfigService spiderConfigService=SpringUtils.getBean(ISpiderConfigService.class);//爬虫配置
 
    private SpiderFieldMapper spiderFieldMapper=SpringUtils.getBean(SpiderFieldMapper.class);//爬虫字段
 
    private SpiderFiledRuleMapper spiderFiledRuleMapper=SpringUtils.getBean(SpiderFiledRuleMapper.class);//爬虫字段值处理规则
 
    public SpiderBackendService(String missionId){
        this.missionId=missionId;
    }
 
    public SpiderBackendService(String missionId,ICallBack callBack){
        this.missionId=missionId;
        this.callBack=callBack;
    }
    @Override
    public void run() {
        SpiderMission mission=spiderMissionService.selectSpiderMissionById(Long.valueOf(missionId));
        if(mission!=null){
            if(SpiderConstants.SPIDER_MISSION_STATUS_RUNNING.equals(mission.getStatus())){
                logger.warn(">>>>>>>>>>>>>>>爬虫任务["+missionId+"]已经在运行!本次不在执行!<<<<<<<<<");
                return;
            }
            //查询爬虫配置
            Long configId=mission.getSpiderConfigId();
            SpiderConfig config = spiderConfigService.selectSpiderConfigById(configId);
            //查询字段配置
            SpiderField queryForm=new SpiderField();
            queryForm.setConfigId(config.getId());
            List<SpiderField> fields = spiderFieldMapper.selectSpiderFieldList(queryForm);
            config.setFieldsList(fields);
            //设置字段值处理规则
            for(SpiderField field:fields){
                SpiderFiledRule ruleQueryForm=new SpiderFiledRule();
                ruleQueryForm.setFieldId(field.getFieldId().toString());
                List<SpiderFiledRule> rules = spiderFiledRuleMapper.selectSpiderFiledRuleList(ruleQueryForm);
                field.setFieldRules(rules);
            }
            //设置入口地址
            String entryUrls=mission.getEntryUrls();
            List<String> urls= Lists.newArrayList();
            if(StringUtils.isNotEmpty(entryUrls)){
                String[] arr=entryUrls.split(",");
                for(String s:arr){
                    if(StringUtils.isNotEmpty(s)&&isURL(s)){
                        urls.add(s);
                    }
                    if(StringUtils.isNotEmpty(s)&&!isURL(s)){
                        logger.warn(">>>>>>>>>>>>>>>配置的url:["+s+"]不是一个有效的url!");
                    }
                }
            }
            config.setEntryUrlsList(urls);
 
            //设置退出方式
            config.setExitWay(mission.getExitWay());
            Long c= mission.getExitWayCount();
            if(c==null){
                c=0L;
            }
            config.setCount(Integer.valueOf(c.toString()));
            if(StringUtils.isNotEmpty(mission.getHeaderStr())){
                config.setHeader(mission.getHeaderStr());
            }
            if(StringUtils.isNotEmpty(mission.getCookieStr())){
                config.setCookie(mission.getCookieStr());
            }
            AbstractProcessor processor=new DefalutProcessor(config,missionId.toString());
 
            mission.setStatus(SpiderConstants.SPIDER_MISSION_STATUS_RUNNING);
            mission.setStartTime(new Date());
            spiderMissionService.updateSpiderMission(mission);
 
            CopyOnWriteArrayList<LinkedHashMap<String, String>> datas = processor.execute();//执行爬虫
            mission.setEndTime(new Date());
            mission.setStatus(SpiderConstants.SPIDER_MISSION_STATUS_DONE);
            mission.setSuccessNum(Long.valueOf(datas.size()));
            Long count=(mission.getEndTime().getTime()-mission.getStartTime().getTime())/1000;
            mission.setTimeCost(count.toString());
            spiderMissionService.updateSpiderMission(mission);
            if(callBack!=null){
                Map<String,CopyOnWriteArrayList<LinkedHashMap<String, String>>> rmap=new HashMap();
                rmap.put("datas",datas);
                callBack.setParams(rmap);
                callBack.onSuccess();
            }
        }
    }
 
    private static boolean isURL(String str){
        str = str.toLowerCase();
        String regex = "(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]";
        return str.matches(regex);
    }
}