package com.javaweb.spider.fast; import com.alibaba.fastjson.JSONArray; import com.alibaba.fastjson.JSONObject; import com.javaweb.spider.domain.SpiderConfig; import com.javaweb.spider.domain.SpiderField; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.collections.MapUtils; import org.apache.commons.lang3.StringUtils; import org.assertj.core.util.Lists; import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; /** * 通过读取json配置文件快速构建爬虫任务配置信息 */ public class FastConfigContext { public static SpiderConfig parseConfig(FastConfigEnum fastConfigEnum) { return parseConfig(fastConfigEnum.getCode()); } /** * 解析爬虫配置 * * @param code 爬虫代码 * @return SpiderConfig */ public static SpiderConfig parseConfig(String code) { String platformConfig = FastConfigTemplate.getConfig(code); JSONObject jsonObject = JSONObject.parseObject(platformConfig); String br = "\r\n"; Set> entries = jsonObject.entrySet(); List spiderFields= Lists.newArrayList(); String cookieStr =""; for (Map.Entry entry : entries) { if ("header".equals(entry.getKey())) { List headers = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class); entry.setValue(String.join(br, headers)); } if ("entryUrls".equals(entry.getKey())) { List urls = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class); entry.setValue(String.join(",", urls)); } if ("fields".equals(entry.getKey())) { List fields = JSONArray.parseArray(String.valueOf(entry.getValue()), String.class); SpiderField spiderField=null; for(String f:fields){ if(f.contains("#")){ String[] arr=f.split("#"); if(arr!=null&&arr.length==3){ spiderField=new SpiderField(); spiderField.setFieldName(arr[0].trim()); spiderField.setField(arr[1].trim()); spiderField.setExtractBy(arr[2].trim()); spiderFields.add(spiderField); } } } } } SpiderConfig spiderConfig=JSONObject.toJavaObject(jsonObject, SpiderConfig.class); spiderConfig.setSpiderCode(code); spiderConfig.setFieldsList(spiderFields); return spiderConfig; } /** * 重新解析配置模板, 将用户id替换为真实的id * * @param config config * @return config */ public static SpiderConfig replaceConfigUid(SpiderConfig config) { if (null == config) { return null; } String uid = config.getUserId(); if (StringUtils.isEmpty(uid)) { return config; } String domain = config.getDomain(); if (StringUtils.isNotEmpty(domain)) { config.setDomain(domain.replace("{userId}", uid)); } String targetLinksRegex = config.getTargetRegex(); if (StringUtils.isNotEmpty(targetLinksRegex)) { config.setTargetRegex(targetLinksRegex.replace("{userId}", uid)); } //注意入口变量有2个 entryUrlsList 和 entryUrls List entryUrlsList = config.getEntryUrlsList(); if (CollectionUtils.isNotEmpty(entryUrlsList)) { List newEntryUrls = new ArrayList<>(); for (String entryUrl : entryUrlsList) { newEntryUrls.add(entryUrl.replace("{userId}", uid)); } config.setEntryUrlsList(newEntryUrls); } //entryUrls String entryUrls=config.getEntryUrls(); config.setEntryUrls(entryUrls.replace("{userId}", uid)); Map header = config.getHeaders(); if (MapUtils.isNotEmpty(header)) { Set> entries = header.entrySet(); for (Map.Entry entry : entries) { String key = entry.getKey(); String value = entry.getValue(); header.put(key, value.replace("{userId}", uid)); } } return config; } }