package com.javaweb.spider.domain; import com.javaweb.spider.config.SpiderConstants; import com.javaweb.spider.fast.FastConfigContext; import com.javaweb.spider.util.FackUserAgentUtil; import lombok.Data; import org.apache.commons.collections.CollectionUtils; import org.apache.commons.lang3.StringUtils; import com.javaweb.common.annotation.Excel; import com.javaweb.common.core.domain.BaseEntity; import org.assertj.core.util.Lists; import us.codecraft.webmagic.pipeline.Pipeline; import us.codecraft.webmagic.proxy.Proxy; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; /** * 爬虫配置对象 spider_config * * @author wujiyue * @date 2019-11-11 */ @Data public class SpiderConfig extends BaseEntity { private static final long serialVersionUID = 1L; /** 爬虫配置ID */ private Long id; /** 爬虫编码 */ @Excel(name = "爬虫编码") private String spiderCode; /** 爬虫名称 */ @Excel(name = "爬虫名称") private String spiderName; /** 入口地址 */ private String entryUrls; /** 目标URL正则 */ private String targetRegex; /** 存储的表名 */ @Excel(name = "存储的表名") private String tableName; /** 网站根域名 */ @Excel(name = "网站根域名") private String domain; /** 字符集 */ @Excel(name = "字符集") private String charset = "utf8"; /** 睡眠时间(ms) */ @Excel(name = "睡眠时间(ms)") private Long sleepTime= 1000L; /** 重试次数 */ @Excel(name = "重试次数") private Integer retryTimes = 2; /** 线程数量 */ @Excel(name = "线程数量") private Long threadCount = 1L; /** 使用代理 */ @Excel(name = "使用代理") private Integer useProxy =0 ; @Excel(name = "打印日志") private Integer showLog=1; public Integer getShowLog(){ return this.showLog; } /** * 退出方式{DURATION:爬虫持续的时间,URL_COUNT:抓取到的url数量} */ private String exitWay = ExitWayEnum.URL_COUNT.toString(); /** * 对应退出方式,当exitWay = URL_COUNT时,该值表示url数量,当exitWay = DURATION时,该值表示爬虫持续的时间 */ private int count; private Integer cascade=0;//默认0表示支持入口URL发现目标URL,而不从目标URL级联收集URL public Integer getCascade(){ return this.cascade; } private List cookies = new ArrayList<>(); private Map headers = new HashMap<>(); private String ua = FackUserAgentUtil.getUserAgent(); /** 入口地址集合 */ private List entryUrlsList= Lists.newArrayList(); private List fieldsList=Lists.newArrayList(); public List getFieldsList(){ return this.fieldsList; } public String getTargetRegex(){ return this.targetRegex; } public Integer getUseProxy(){ return this.useProxy; } public List getProxyList(){ return this.proxyList; } private List pipelineList=Lists.newArrayList(); public List getPipelineList(){ return this.pipelineList; } /** * 是否转存图片,当选择true时会自动过滤原文中的img链接,调用端可选择将图片下载后替换掉原来的图片 */ private boolean convertImg = false; private List proxyList = new ArrayList<>(); private String userId;//扩展字段,无数据库字段对应。对应于爬取某个人的博客的作者uid public String getUserId(){ return this.userId; } public String getDomain(){ return this.domain; } public Long getThreadCount(){ return this.threadCount; } public String getEntryUrls(){ return this.entryUrls; } public static SpiderConfig create(){ return new SpiderConfig(); } public List getEntryUrlsList(){ return this.entryUrlsList; } public String getSpiderCode(){ return this.spiderCode; } public String getSpiderName(){ return this.spiderName; } public String getExitWay(){ return this.exitWay; } public void setExitWay(String exitWay) { this.exitWay = exitWay; } public SpiderConfig setExitWay(ExitWayEnum exitWay) { this.exitWay = exitWay.toString(); this.count = exitWay.getDefaultCount(); return this; } public SpiderConfig setConvertImg(boolean convertImg) { this.convertImg = convertImg; return this; } public int getCount(){ return this.count; } public SpiderConfig setCount(int count) { this.count = count; return this; } public Map getHeaders(){ return this.headers; } public SpiderConfig setHeader(String key, String value) { Map headers = this.getHeaders(); headers.put(key, value); return this; } public SpiderConfig setHeader(String headersStr) { if (StringUtils.isNotEmpty(headersStr)) { String[] headerArr = headersStr.split("\r\n"); for (String s : headerArr) { String[] header = s.split("="); setHeader(header[0], header[1]); } } return this; } public List getCookies(){ return this.cookies; } public SpiderConfig setCookie(String domain, String key, String value) { List cookies = this.getCookies(); cookies.add(new Cookie(domain, key, value)); return this; } public SpiderConfig setCookie(String cookiesStr) { if (StringUtils.isNotEmpty(cookiesStr)) { List cookies = this.getCookies(); String[] cookieArr = cookiesStr.split(";"); for (String aCookieArr : cookieArr) { String[] cookieNode = aCookieArr.split("="); if (cookieNode.length <= 1) { continue; } cookies.add(new Cookie(cookieNode[0].trim(), cookieNode[1].trim())); } } return this; } private void addProxy(Proxy proxy) { if (this.useProxy == 0 || null == this.useProxy || null == proxy) { return; } proxyList.add(proxy); } public SpiderConfig setProxy(String proxyStr) { if (this.useProxy == 0 || null == this.useProxy || proxyStr == null) { return this; } String[] proxyArr = proxyStr.split(","); for (String s : proxyArr) { String[] proxy = s.split(":"); if (proxy.length == 2) { this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1]))); } else if (proxy.length == 4) { this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1]), proxy[2], proxy[3])); } } return this; } public SpiderConfig setEntryUrls(String entryUrls) { this.entryUrls = entryUrls; if(StringUtils.isNotEmpty(entryUrls)){ String[] arr=entryUrls.split(","); for(String s:arr){ if(StringUtils.isNotEmpty(s)){ entryUrlsList.add(s); } } } return this; } public SpiderConfig addEntryUrl(String entryUrl) { this.entryUrlsList.add(entryUrl); return this; } public void setEntryUrlsList(List entryUrlsList) { this.entryUrlsList = entryUrlsList; } public SpiderConfig setFieldsList(List fieldsList) { if(CollectionUtils.isEmpty(fieldsList)){ return this; } this.fieldsList.addAll(fieldsList); return this; } /** * 增加一个爬取字段 * @param field * @return */ public SpiderConfig addField(SpiderField field) { if(field==null||StringUtils.isEmpty(field.getField())){ return this; } this.fieldsList.add(field); return this; } /** * 增加一个数据管道 * @param pipeline * @return */ public SpiderConfig addPipeline(Pipeline pipeline) { if(pipeline==null){ return this; } this.pipelineList.add(pipeline); return this; } /** * 增加一个xpath提取规则字段 * @param field * @param fieldName * @param xpath * @return */ public SpiderConfig addField(String field,String fieldName,String xpath) { SpiderField spiderField=new SpiderField(); spiderField.setField(field); spiderField.setFieldName(fieldName); spiderField.setExtractType(SpiderConstants.FIELD_EXTRACT_TYPE_XPATH); spiderField.setExtractBy(xpath); this.fieldsList.add(spiderField); return this; } public SpiderConfig setTargetRegex(String targetRegex) { this.targetRegex = targetRegex; return this; } public SpiderConfig setThreadCount(Long threadCount) { this.threadCount = threadCount; return this; } public SpiderConfig setShowLog(Integer showLog) { this.showLog = showLog; return this; } public SpiderConfig setSleepTime(Long sleepTime) { this.sleepTime = sleepTime; return this; } public SpiderConfig setDomain(String domain) { this.domain = domain; return this; } public SpiderConfig setCharset(String charset) { this.charset = charset; return this; } public SpiderConfig setRetryTimes(Integer retryTimes) { this.retryTimes = retryTimes; return this; } public SpiderConfig setUseProxy(Integer useProxy) { this.useProxy = useProxy; return this; } public SpiderConfig setCascade(Integer cascade) { this.cascade = cascade; return this; } /** * 爬取某个人博客时候博主的用户id * @param userId * @return */ public SpiderConfig setUserId(String userId) { this.userId = userId; return FastConfigContext.replaceConfigUid(this); } public Long getId() { return id; } public void setId(Long id) { this.id = id; } public void setSpiderCode(String spiderCode) { this.spiderCode = spiderCode; } public void setSpiderName(String spiderName) { this.spiderName = spiderName; } public String getTableName() { return tableName; } public void setTableName(String tableName) { this.tableName = tableName; } public void setCookies(List cookies) { this.cookies = cookies; } public void setHeaders(Map headers) { this.headers = headers; } public String getUa() { return ua; } public void setUa(String ua) { this.ua = ua; } public void setPipelineList(List pipelineList) { this.pipelineList = pipelineList; } public void setProxyList(List proxyList) { this.proxyList = proxyList; } public String getCharset() { return charset; } public Long getSleepTime() { return sleepTime; } public Integer getRetryTimes() { return retryTimes; } public boolean isConvertImg() { return convertImg; } }