package com.javaweb.spider.domain;
|
|
import com.javaweb.spider.config.SpiderConstants;
|
import com.javaweb.spider.fast.FastConfigContext;
|
import com.javaweb.spider.util.FackUserAgentUtil;
|
import lombok.Data;
|
import org.apache.commons.collections.CollectionUtils;
|
import org.apache.commons.lang3.StringUtils;
|
import com.javaweb.common.annotation.Excel;
|
import com.javaweb.common.core.domain.BaseEntity;
|
import org.assertj.core.util.Lists;
|
import us.codecraft.webmagic.pipeline.Pipeline;
|
import us.codecraft.webmagic.proxy.Proxy;
|
|
import java.util.ArrayList;
|
import java.util.HashMap;
|
import java.util.List;
|
import java.util.Map;
|
|
/**
|
* 爬虫配置对象 spider_config
|
*
|
* @author wujiyue
|
* @date 2019-11-11
|
*/
|
@Data
|
public class SpiderConfig extends BaseEntity
|
{
|
private static final long serialVersionUID = 1L;
|
|
/** 爬虫配置ID */
|
private Long id;
|
/** 爬虫编码 */
|
@Excel(name = "爬虫编码")
|
private String spiderCode;
|
/** 爬虫名称 */
|
@Excel(name = "爬虫名称")
|
private String spiderName;
|
|
/** 入口地址 */
|
private String entryUrls;
|
|
/** 目标URL正则 */
|
private String targetRegex;
|
|
/** 存储的表名 */
|
@Excel(name = "存储的表名")
|
private String tableName;
|
|
/** 网站根域名 */
|
@Excel(name = "网站根域名")
|
private String domain;
|
|
|
/** 字符集 */
|
@Excel(name = "字符集")
|
private String charset = "utf8";
|
|
/** 睡眠时间(ms) */
|
@Excel(name = "睡眠时间(ms)")
|
private Long sleepTime= 1000L;
|
|
/** 重试次数 */
|
@Excel(name = "重试次数")
|
private Integer retryTimes = 2;
|
|
/** 线程数量 */
|
@Excel(name = "线程数量")
|
private Long threadCount = 1L;
|
|
/** 使用代理 */
|
@Excel(name = "使用代理")
|
private Integer useProxy =0 ;
|
@Excel(name = "打印日志")
|
private Integer showLog=1;
|
public Integer getShowLog(){
|
return this.showLog;
|
}
|
/**
|
* 退出方式{DURATION:爬虫持续的时间,URL_COUNT:抓取到的url数量}
|
*/
|
private String exitWay = ExitWayEnum.URL_COUNT.toString();
|
/**
|
* 对应退出方式,当exitWay = URL_COUNT时,该值表示url数量,当exitWay = DURATION时,该值表示爬虫持续的时间
|
*/
|
private int count;
|
|
private Integer cascade=0;//默认0表示支持入口URL发现目标URL,而不从目标URL级联收集URL
|
|
public Integer getCascade(){
|
return this.cascade;
|
}
|
|
private List<Cookie> cookies = new ArrayList<>();
|
private Map<String, String> headers = new HashMap<>();
|
private String ua = FackUserAgentUtil.getUserAgent();
|
/** 入口地址集合 */
|
private List<String> entryUrlsList= Lists.newArrayList();
|
|
private List<SpiderField> fieldsList=Lists.newArrayList();
|
|
public List<SpiderField> getFieldsList(){
|
return this.fieldsList;
|
}
|
|
public String getTargetRegex(){
|
return this.targetRegex;
|
}
|
|
public Integer getUseProxy(){
|
return this.useProxy;
|
}
|
|
public List<Proxy> getProxyList(){
|
return this.proxyList;
|
}
|
|
private List<Pipeline> pipelineList=Lists.newArrayList();
|
|
public List<Pipeline> getPipelineList(){
|
return this.pipelineList;
|
}
|
|
/**
|
* 是否转存图片,当选择true时会自动过滤原文中的img链接,调用端可选择将图片下载后替换掉原来的图片
|
*/
|
private boolean convertImg = false;
|
private List<Proxy> proxyList = new ArrayList<>();
|
|
private String userId;//扩展字段,无数据库字段对应。对应于爬取某个人的博客的作者uid
|
|
public String getUserId(){
|
return this.userId;
|
}
|
|
public String getDomain(){
|
return this.domain;
|
}
|
|
public Long getThreadCount(){
|
return this.threadCount;
|
}
|
|
public String getEntryUrls(){
|
return this.entryUrls;
|
}
|
|
public static SpiderConfig create(){
|
return new SpiderConfig();
|
}
|
|
public List<String> getEntryUrlsList(){
|
return this.entryUrlsList;
|
}
|
|
public String getSpiderCode(){
|
return this.spiderCode;
|
}
|
|
public String getSpiderName(){
|
return this.spiderName;
|
}
|
|
public String getExitWay(){
|
return this.exitWay;
|
}
|
|
public void setExitWay(String exitWay) {
|
this.exitWay = exitWay;
|
}
|
|
public SpiderConfig setExitWay(ExitWayEnum exitWay) {
|
this.exitWay = exitWay.toString();
|
this.count = exitWay.getDefaultCount();
|
return this;
|
}
|
|
|
public SpiderConfig setConvertImg(boolean convertImg) {
|
this.convertImg = convertImg;
|
return this;
|
}
|
|
public int getCount(){
|
return this.count;
|
}
|
|
public SpiderConfig setCount(int count) {
|
this.count = count;
|
return this;
|
}
|
|
public Map<String, String> getHeaders(){
|
return this.headers;
|
}
|
|
public SpiderConfig setHeader(String key, String value) {
|
Map<String, String> headers = this.getHeaders();
|
headers.put(key, value);
|
return this;
|
}
|
|
public SpiderConfig setHeader(String headersStr) {
|
if (StringUtils.isNotEmpty(headersStr)) {
|
String[] headerArr = headersStr.split("\r\n");
|
for (String s : headerArr) {
|
String[] header = s.split("=");
|
setHeader(header[0], header[1]);
|
}
|
}
|
return this;
|
}
|
|
public List<Cookie> getCookies(){
|
return this.cookies;
|
}
|
|
public SpiderConfig setCookie(String domain, String key, String value) {
|
List<Cookie> cookies = this.getCookies();
|
cookies.add(new Cookie(domain, key, value));
|
return this;
|
}
|
|
public SpiderConfig setCookie(String cookiesStr) {
|
if (StringUtils.isNotEmpty(cookiesStr)) {
|
List<Cookie> cookies = this.getCookies();
|
String[] cookieArr = cookiesStr.split(";");
|
for (String aCookieArr : cookieArr) {
|
String[] cookieNode = aCookieArr.split("=");
|
if (cookieNode.length <= 1) {
|
continue;
|
}
|
cookies.add(new Cookie(cookieNode[0].trim(), cookieNode[1].trim()));
|
}
|
}
|
return this;
|
}
|
|
|
private void addProxy(Proxy proxy) {
|
if (this.useProxy == 0 || null == this.useProxy || null == proxy) {
|
return;
|
}
|
proxyList.add(proxy);
|
}
|
|
public SpiderConfig setProxy(String proxyStr) {
|
if (this.useProxy == 0 || null == this.useProxy || proxyStr == null) {
|
return this;
|
}
|
String[] proxyArr = proxyStr.split(",");
|
for (String s : proxyArr) {
|
String[] proxy = s.split(":");
|
if (proxy.length == 2) {
|
this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1])));
|
} else if (proxy.length == 4) {
|
this.addProxy(new Proxy(proxy[0], Integer.parseInt(proxy[1]), proxy[2], proxy[3]));
|
}
|
}
|
return this;
|
}
|
|
|
public SpiderConfig setEntryUrls(String entryUrls) {
|
this.entryUrls = entryUrls;
|
if(StringUtils.isNotEmpty(entryUrls)){
|
String[] arr=entryUrls.split(",");
|
for(String s:arr){
|
if(StringUtils.isNotEmpty(s)){
|
entryUrlsList.add(s);
|
}
|
}
|
}
|
return this;
|
}
|
public SpiderConfig addEntryUrl(String entryUrl) {
|
this.entryUrlsList.add(entryUrl);
|
return this;
|
}
|
public void setEntryUrlsList(List<String> entryUrlsList) {
|
this.entryUrlsList = entryUrlsList;
|
}
|
|
public SpiderConfig setFieldsList(List<SpiderField> fieldsList) {
|
if(CollectionUtils.isEmpty(fieldsList)){
|
return this;
|
}
|
this.fieldsList.addAll(fieldsList);
|
return this;
|
}
|
|
/**
|
* 增加一个爬取字段
|
* @param field
|
* @return
|
*/
|
public SpiderConfig addField(SpiderField field) {
|
if(field==null||StringUtils.isEmpty(field.getField())){
|
return this;
|
}
|
this.fieldsList.add(field);
|
return this;
|
}
|
/**
|
* 增加一个数据管道
|
* @param pipeline
|
* @return
|
*/
|
public SpiderConfig addPipeline(Pipeline pipeline) {
|
if(pipeline==null){
|
return this;
|
}
|
this.pipelineList.add(pipeline);
|
return this;
|
}
|
|
/**
|
* 增加一个xpath提取规则字段
|
* @param field
|
* @param fieldName
|
* @param xpath
|
* @return
|
*/
|
public SpiderConfig addField(String field,String fieldName,String xpath) {
|
SpiderField spiderField=new SpiderField();
|
spiderField.setField(field);
|
spiderField.setFieldName(fieldName);
|
spiderField.setExtractType(SpiderConstants.FIELD_EXTRACT_TYPE_XPATH);
|
spiderField.setExtractBy(xpath);
|
this.fieldsList.add(spiderField);
|
return this;
|
}
|
|
public SpiderConfig setTargetRegex(String targetRegex) {
|
this.targetRegex = targetRegex;
|
return this;
|
}
|
|
public SpiderConfig setThreadCount(Long threadCount) {
|
this.threadCount = threadCount;
|
return this;
|
}
|
|
public SpiderConfig setShowLog(Integer showLog) {
|
this.showLog = showLog;
|
return this;
|
}
|
|
public SpiderConfig setSleepTime(Long sleepTime) {
|
this.sleepTime = sleepTime;
|
return this;
|
}
|
|
public SpiderConfig setDomain(String domain) {
|
this.domain = domain;
|
return this;
|
}
|
|
public SpiderConfig setCharset(String charset) {
|
this.charset = charset;
|
return this;
|
}
|
|
public SpiderConfig setRetryTimes(Integer retryTimes) {
|
this.retryTimes = retryTimes;
|
return this;
|
}
|
|
public SpiderConfig setUseProxy(Integer useProxy) {
|
this.useProxy = useProxy;
|
return this;
|
}
|
|
public SpiderConfig setCascade(Integer cascade) {
|
this.cascade = cascade;
|
return this;
|
}
|
|
/**
|
* 爬取某个人博客时候博主的用户id
|
* @param userId
|
* @return
|
*/
|
public SpiderConfig setUserId(String userId) {
|
this.userId = userId;
|
return FastConfigContext.replaceConfigUid(this);
|
}
|
public Long getId() {
|
return id;
|
}
|
public void setId(Long id) {
|
this.id = id;
|
}
|
|
public void setSpiderCode(String spiderCode) {
|
this.spiderCode = spiderCode;
|
}
|
|
|
public void setSpiderName(String spiderName) {
|
this.spiderName = spiderName;
|
}
|
public String getTableName() {
|
return tableName;
|
}
|
public void setTableName(String tableName) {
|
this.tableName = tableName;
|
}
|
|
public void setCookies(List<Cookie> cookies) {
|
this.cookies = cookies;
|
}
|
public void setHeaders(Map<String, String> headers) {
|
this.headers = headers;
|
}
|
public String getUa() {
|
return ua;
|
}
|
public void setUa(String ua) {
|
this.ua = ua;
|
}
|
|
public void setPipelineList(List<Pipeline> pipelineList) {
|
this.pipelineList = pipelineList;
|
}
|
|
public void setProxyList(List<Proxy> proxyList) {
|
this.proxyList = proxyList;
|
}
|
|
public String getCharset() {
|
return charset;
|
}
|
public Long getSleepTime() {
|
return sleepTime;
|
}
|
public Integer getRetryTimes() {
|
return retryTimes;
|
}
|
|
public boolean isConvertImg() {
|
return convertImg;
|
}
|
|
}
|