地质所 沉降监测网建设项目
zmk
2024-05-15 9e3afc6d0fa514f986d3fea40fa23124e6fb5070
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
package com.javaweb.spider.fast;
 
import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.collections.MapUtils;
import org.apache.commons.lang3.StringUtils;
import org.assertj.core.util.Lists;
 
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Set;
 
/**
 * 通过读取json配置文件快速构建爬虫任务配置信息
 */
public class FastConfigContext {
 
    public static SpiderConfig parseConfig(FastConfigEnum fastConfigEnum) {
        return  parseConfig(fastConfigEnum.getCode());
    }
 
    /**
     * 解析爬虫配置
     *
     * @param code 爬虫代码
     * @return SpiderConfig
     */
    public static SpiderConfig parseConfig(String code) {
        String platformConfig = FastConfigTemplate.getConfig(code);
        JSONObject jsonObject = JSONObject.parseObject(platformConfig);
        String br = "\r\n";
        Set<Map.Entry<String, Object>> entries = jsonObject.entrySet();
        List<SpiderField> spiderFields= Lists.newArrayList();
        String cookieStr ="";
        for (Map.Entry<String, Object> entry : entries) {
 
            if ("header".equals(entry.getKey())) {
                List<String> headers = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class);
                entry.setValue(String.join(br, headers));
            }
            if ("entryUrls".equals(entry.getKey())) {
                List<String> urls = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class);
                entry.setValue(String.join(",", urls));
            }
 
            if ("fields".equals(entry.getKey())) {
                List<String> fields = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class);
                SpiderField spiderField=null;
                for(String f:fields){
                    if(f.contains("#")){
                        String[] arr=f.split("#");
                        if(arr!=null&&arr.length==3){
                            spiderField=new SpiderField();
                            spiderField.setFieldName(arr[0].trim());
                            spiderField.setField(arr[1].trim());
                            spiderField.setExtractBy(arr[2].trim());
                            spiderFields.add(spiderField);
                        }
                    }
                }
            }
        }
        SpiderConfig spiderConfig=JSONObject.toJavaObject(jsonObject, SpiderConfig.class);
        spiderConfig.setSpiderCode(code);
        spiderConfig.setFieldsList(spiderFields);
        return spiderConfig;
    }
 
    /**
     * 重新解析配置模板, 将用户id替换为真实的id
     *
     * @param config config
     * @return config
     */
    public static SpiderConfig replaceConfigUid(SpiderConfig config) {
        if (null == config) {
            return null;
        }
        String uid = config.getUserId();
        if (StringUtils.isEmpty(uid)) {
            return config;
        }
        String domain = config.getDomain();
        if (StringUtils.isNotEmpty(domain)) {
            config.setDomain(domain.replace("{userId}", uid));
        }
        String targetLinksRegex = config.getTargetRegex();
        if (StringUtils.isNotEmpty(targetLinksRegex)) {
            config.setTargetRegex(targetLinksRegex.replace("{userId}", uid));
        }
        //注意入口变量有2个  entryUrlsList 和 entryUrls
        List<String> entryUrlsList = config.getEntryUrlsList();
        if (CollectionUtils.isNotEmpty(entryUrlsList)) {
            List<String> newEntryUrls = new ArrayList<>();
            for (String entryUrl : entryUrlsList) {
                newEntryUrls.add(entryUrl.replace("{userId}", uid));
            }
            config.setEntryUrlsList(newEntryUrls);
        }
        //entryUrls
        String entryUrls=config.getEntryUrls();
        config.setEntryUrls(entryUrls.replace("{userId}", uid));
 
        Map<String, String> header = config.getHeaders();
        if (MapUtils.isNotEmpty(header)) {
            Set<Map.Entry<String, String>> entries = header.entrySet();
            for (Map.Entry<String, String> entry : entries) {
                String key = entry.getKey();
                String value = entry.getValue();
                header.put(key, value.replace("{userId}", uid));
            }
        }
        return config;
    }
}