/*
 * Provenance (residue of the repository web viewer this listing was copied from;
 * the viewer's line-number gutter has been dropped):
 *   project: 地质所 沉降监测网建设项目
 *   chenhuan, 2024-05-16, 0fdd42e318f51f9e3c6581473416af1cca69877f
 */
package com.javaweb.spider.resolver;
 
import com.javaweb.common.utils.StringUtils;
import com.javaweb.spider.config.SpiderConstants;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import com.javaweb.spider.domain.SpiderFiledRule;
import org.apache.commons.collections.CollectionUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.selector.Html;
 
import java.util.Comparator;
import java.util.List;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
 
/**
 * 解析处理普通的Html网页
 */
public class DefaultResolver implements Resolver {
 
    @Override
    public void process(Page page, SpiderConfig spiderConfig) {
        List<SpiderField> fields = spiderConfig.getFieldsList();
 
        Html pageHtml = page.getHtml();
        Document pageDoc =page.getHtml().getDocument();
        String link = page.getRequest().getUrl();
 
        if(!spiderConfig.getEntryUrlsList().contains(link) || Pattern.matches(spiderConfig.getTargetRegex(), link)) {
            //入口页面无需取值,入口页面只是用于发现目标url,除非入口页面匹配目标url的正则
            for (SpiderField field : fields) {
                String column=field.getField();//有可能取得同一个字段需要多个不同规则
                //先判断该字段是否有值,如果没有在继续处理,有了就不用处理了
                String checkVal = String.valueOf(page.getResultItems().getAll().get(column));
                if(StringUtils.isNotEmpty(checkVal)){
                    continue;
                }
                String type = field.getExtractType();
                if (StringUtils.isEmpty(type)||type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_XPATH)) {
                    this.xpath_put(page, pageHtml, field);
                } else if (type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_CSS)) {
                    this.css_put(page, pageDoc, field);
                } else if (type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_CONSTANT)) {
                    this.constant_put(page,field);
                } else {
                }
            }
            page.putField("link", link);
        }
        if (StringUtils.isNotEmpty(spiderConfig.getTargetRegex())) {
            if(spiderConfig.getCascade()==1 || spiderConfig.getEntryUrlsList().contains(link)){
                //级联发现或者是入口URL才收集目标URL
                page.addTargetRequests(page.getHtml().links().regex(spiderConfig.getTargetRegex()).all());
            }
        }
    }
 
    /**
     * 处理爬取后原始的文本内容,比如替换截取等操作最终得到自己想要的字符串
     * @param sourceValue
     * @param field
     * @return
     */
    private String processValue(String sourceValue,SpiderField field){
        if(StringUtils.isEmpty(sourceValue) || CollectionUtils.isEmpty(field.getFieldRules())){
            return sourceValue;
        }
        List<SpiderFiledRule> rules = field.getFieldRules();
        rules=rules.stream().sorted(Comparator.comparing(SpiderFiledRule::getSort)).collect(Collectors.toList());
        for(SpiderFiledRule rule:rules){
            if(SpiderConstants.FIELD_PROCESS_TYPE_REPLACE.equals(rule.getProcessType())){
                sourceValue=sourceValue.replace(rule.getReplacereg(),StringUtils.isEmpty(rule.getReplacement())?"":rule.getReplacement());
            }else if(SpiderConstants.FIELD_PROCESS_TYPE_SUBSTRING_AFTER.equals(rule.getProcessType())){
                sourceValue=StringUtils.trim_end_exclu(sourceValue, rule.getSubstrTarget());
            }else if(SpiderConstants.FIELD_PROCESS_TYPE_SUBSTRING_BEFORE.equals(rule.getProcessType())){
                sourceValue=StringUtils.trim_before_exclu(sourceValue, rule.getSubstrTarget());
            }else{}
        }
        return sourceValue;
    }
    private void xpath_put(Page page, Html pageHtml,SpiderField field) {
        if (StringUtils.isNotEmpty(field.getExtractAttr())) {
            String value=pageHtml.xpath(field.getExtractAttr()).get();
            value= processValue(value,field);//处理替换
            page.putField(field.getField(), value);
        }
    }
    private void css_put(Page page,Document pageDoc,SpiderField field) {
        String[] indexArr=field.getExtractIndex().split(",");
        String resStr="";
        if(!"1".equals(field.getExtractAttrFlag())){
            //非根据属性名取值
            for(String ix:indexArr){
                if(StringUtils.isNotEmpty(ix)){
                    String tempRes="";
                    try{
                        tempRes=  pageDoc.select(field.getExtractBy()).get(Integer.valueOf(ix)).html();
                    }catch (Exception ex){}
                    if(StringUtils.isNotEmpty(tempRes)){
                        resStr +=tempRes;
                        resStr +=",";
                    }
                }
            }
        }else{
            //根据属性名取值
            for(String ix:indexArr) {
                if (StringUtils.isNotEmpty(ix)) {
                    String tempRes="";
                    try {
                        tempRes= getAttributeByElement(pageDoc.select(field.getExtractBy()).get(Integer.valueOf(ix)),field.getExtractAttr());
                    }catch (Exception ex){}
                    if(StringUtils.isNotEmpty(tempRes)){
                        resStr +=tempRes;
                        resStr +=",";
                    }
                }
            }
 
        }
        if(resStr.endsWith(",")){
            resStr=resStr.substring(0,resStr.length()-1);
        }
        //处理替换
        resStr= processValue(resStr,field);
        page.putField(field.getField(),resStr);
    }
    private void constant_put(Page page,SpiderField field) {
        page.putField(field.getField(),field.getConstantValue());
    }
 
    private String getAttributeByElement(Element e, String attrName){
        // 判断如果属性名是href或者src
        String res = "";
        if ("href".equals(attrName) || "src".equals(attrName)) {
            // 因为要获取他们绝对路径
            res = e.attr("abs:" + attrName);
        } else {
            //不是href或者src
            res = e.attr(attrName);
        }
        return res;
    }
}