package com.javaweb.spider.resolver; import com.javaweb.common.utils.StringUtils; import com.javaweb.spider.config.SpiderConstants; import com.javaweb.spider.domain.SpiderConfig; import com.javaweb.spider.domain.SpiderField; import com.javaweb.spider.domain.SpiderFiledRule; import org.apache.commons.collections.CollectionUtils; import org.jsoup.nodes.Document; import org.jsoup.nodes.Element; import org.jsoup.select.Elements; import us.codecraft.webmagic.Page; import us.codecraft.webmagic.selector.Html; import java.util.Comparator; import java.util.List; import java.util.regex.Pattern; import java.util.stream.Collectors; /** * 解析处理普通的Html网页 */ public class DefaultResolver implements Resolver { @Override public void process(Page page, SpiderConfig spiderConfig) { List fields = spiderConfig.getFieldsList(); Html pageHtml = page.getHtml(); Document pageDoc =page.getHtml().getDocument(); String link = page.getRequest().getUrl(); if(!spiderConfig.getEntryUrlsList().contains(link) || Pattern.matches(spiderConfig.getTargetRegex(), link)) { //入口页面无需取值,入口页面只是用于发现目标url,除非入口页面匹配目标url的正则 for (SpiderField field : fields) { String column=field.getField();//有可能取得同一个字段需要多个不同规则 //先判断该字段是否有值,如果没有在继续处理,有了就不用处理了 String checkVal = String.valueOf(page.getResultItems().getAll().get(column)); if(StringUtils.isNotEmpty(checkVal)){ continue; } String type = field.getExtractType(); if (StringUtils.isEmpty(type)||type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_XPATH)) { this.xpath_put(page, pageHtml, field); } else if (type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_CSS)) { this.css_put(page, pageDoc, field); } else if (type.equals(SpiderConstants.FIELD_EXTRACT_TYPE_CONSTANT)) { this.constant_put(page,field); } else { } } page.putField("link", link); } if (StringUtils.isNotEmpty(spiderConfig.getTargetRegex())) { if(spiderConfig.getCascade()==1 || spiderConfig.getEntryUrlsList().contains(link)){ //级联发现或者是入口URL才收集目标URL page.addTargetRequests(page.getHtml().links().regex(spiderConfig.getTargetRegex()).all()); } } } /** * 处理爬取后原始的文本内容,比如替换截取等操作最终得到自己想要的字符串 * @param sourceValue * @param field * @return */ private String processValue(String sourceValue,SpiderField field){ if(StringUtils.isEmpty(sourceValue) || CollectionUtils.isEmpty(field.getFieldRules())){ return sourceValue; } List rules = field.getFieldRules(); rules=rules.stream().sorted(Comparator.comparing(SpiderFiledRule::getSort)).collect(Collectors.toList()); for(SpiderFiledRule rule:rules){ if(SpiderConstants.FIELD_PROCESS_TYPE_REPLACE.equals(rule.getProcessType())){ sourceValue=sourceValue.replace(rule.getReplacereg(),StringUtils.isEmpty(rule.getReplacement())?"":rule.getReplacement()); }else if(SpiderConstants.FIELD_PROCESS_TYPE_SUBSTRING_AFTER.equals(rule.getProcessType())){ sourceValue=StringUtils.trim_end_exclu(sourceValue, rule.getSubstrTarget()); }else if(SpiderConstants.FIELD_PROCESS_TYPE_SUBSTRING_BEFORE.equals(rule.getProcessType())){ sourceValue=StringUtils.trim_before_exclu(sourceValue, rule.getSubstrTarget()); }else{} } return sourceValue; } private void xpath_put(Page page, Html pageHtml,SpiderField field) { if (StringUtils.isNotEmpty(field.getExtractAttr())) { String value=pageHtml.xpath(field.getExtractAttr()).get(); value= processValue(value,field);//处理替换 page.putField(field.getField(), value); } } private void css_put(Page page,Document pageDoc,SpiderField field) { String[] indexArr=field.getExtractIndex().split(","); String resStr=""; if(!"1".equals(field.getExtractAttrFlag())){ //非根据属性名取值 for(String ix:indexArr){ if(StringUtils.isNotEmpty(ix)){ String tempRes=""; try{ tempRes= pageDoc.select(field.getExtractBy()).get(Integer.valueOf(ix)).html(); }catch (Exception ex){} if(StringUtils.isNotEmpty(tempRes)){ resStr +=tempRes; resStr +=","; } } } }else{ //根据属性名取值 for(String ix:indexArr) { if (StringUtils.isNotEmpty(ix)) { String tempRes=""; try { tempRes= getAttributeByElement(pageDoc.select(field.getExtractBy()).get(Integer.valueOf(ix)),field.getExtractAttr()); }catch (Exception ex){} if(StringUtils.isNotEmpty(tempRes)){ resStr +=tempRes; resStr +=","; } } } } if(resStr.endsWith(",")){ resStr=resStr.substring(0,resStr.length()-1); } //处理替换 resStr= processValue(resStr,field); page.putField(field.getField(),resStr); } private void constant_put(Page page,SpiderField field) { page.putField(field.getField(),field.getConstantValue()); } private String getAttributeByElement(Element e, String attrName){ // 判断如果属性名是href或者src String res = ""; if ("href".equals(attrName) || "src".equals(attrName)) { // 因为要获取他们绝对路径 res = e.attr("abs:" + attrName); } else { //不是href或者src res = e.attr(attrName); } return res; } }