package com.javaweb.spider.backend;

import com.javaweb.common.core.domain.AjaxResult;
import com.javaweb.common.core.domain.ICallBack;
import com.javaweb.common.utils.spring.SpringUtils;
import com.javaweb.spider.config.SpiderConstants;
import com.javaweb.spider.domain.SpiderConfig;
import com.javaweb.spider.domain.SpiderField;
import com.javaweb.spider.domain.SpiderFiledRule;
import com.javaweb.spider.domain.SpiderMission;
import com.javaweb.spider.mapper.SpiderFieldMapper;
import com.javaweb.spider.mapper.SpiderFiledRuleMapper;
import com.javaweb.spider.processer.AbstractProcessor;
import com.javaweb.spider.processer.DefalutProcessor;
import com.javaweb.spider.service.ISpiderConfigService;
import com.javaweb.spider.service.ISpiderMissionService;
import org.apache.commons.lang3.StringUtils;
import org.assertj.core.util.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;

import java.util.*;
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.regex.Pattern;

/**
 * Spider entry point driven by a mission id: all configuration (entry URLs, field
 * definitions, field value-processing rules, headers/cookies, exit strategy) is
 * loaded from the database for the given mission. Unlike the quick-config
 * FastSpiderBackendService, this variant supports per-field value-processing rules.
 */
public class SpiderBackendService extends Thread {

    protected final Logger logger = LoggerFactory.getLogger(SpiderBackendService.class);

    /**
     * URL validation pattern, compiled once instead of on every isURL() call
     * (String.matches recompiles the regex each time).
     */
    private static final Pattern URL_PATTERN =
            Pattern.compile("(https?|ftp|file)://[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]");

    /** Primary key of the SpiderMission row that drives this run. */
    private String missionId;
    /** Optional callback invoked with the crawled data once the run completes. */
    private ICallBack callBack;

    // Collaborators are looked up from the Spring context because this class is
    // instantiated manually as a Thread rather than managed as a bean.
    private ISpiderMissionService spiderMissionService = SpringUtils.getBean(ISpiderMissionService.class);   // spider mission
    private ISpiderConfigService spiderConfigService = SpringUtils.getBean(ISpiderConfigService.class);      // spider configuration
    private SpiderFieldMapper spiderFieldMapper = SpringUtils.getBean(SpiderFieldMapper.class);              // spider field definitions
    private SpiderFiledRuleMapper spiderFiledRuleMapper = SpringUtils.getBean(SpiderFiledRuleMapper.class);  // field value-processing rules

    public SpiderBackendService(String missionId) {
        this.missionId = missionId;
    }

    public SpiderBackendService(String missionId, ICallBack callBack) {
        this.missionId = missionId;
        this.callBack = callBack;
    }

    /**
     * Loads the mission and its configuration from the database, runs the spider,
     * then records the result statistics back on the mission row and fires the
     * optional callback with the crawled data.
     */
    @Override
    public void run() {
        SpiderMission mission = spiderMissionService.selectSpiderMissionById(Long.valueOf(missionId));
        if (mission == null) {
            return;
        }
        // Guard: refuse to start a mission that is already marked as running.
        if (SpiderConstants.SPIDER_MISSION_STATUS_RUNNING.equals(mission.getStatus())) {
            logger.warn(">>>>>>>>>>>>>>>爬虫任务[{}]已经在运行!本次不在执行!<<<<<<<<<", missionId);
            return;
        }

        // Load the spider configuration referenced by the mission.
        Long configId = mission.getSpiderConfigId();
        SpiderConfig config = spiderConfigService.selectSpiderConfigById(configId);

        // Load the field definitions for this configuration.
        SpiderField queryForm = new SpiderField();
        queryForm.setConfigId(config.getId());
        List<SpiderField> fields = spiderFieldMapper.selectSpiderFieldList(queryForm);
        config.setFieldsList(fields);

        // Attach the value-processing rules to each field.
        for (SpiderField field : fields) {
            SpiderFiledRule ruleQueryForm = new SpiderFiledRule();
            ruleQueryForm.setFieldId(field.getFieldId().toString());
            List<SpiderFiledRule> rules = spiderFiledRuleMapper.selectSpiderFiledRuleList(ruleQueryForm);
            field.setFieldRules(rules);
        }

        // Parse the comma-separated entry URLs, keeping only well-formed ones.
        String entryUrls = mission.getEntryUrls();
        List<String> urls = Lists.newArrayList();
        if (StringUtils.isNotEmpty(entryUrls)) {
            for (String s : entryUrls.split(",")) {
                if (StringUtils.isEmpty(s)) {
                    continue;
                }
                // Single isURL() call per candidate (the original validated each URL twice).
                if (isURL(s)) {
                    urls.add(s);
                } else {
                    logger.warn(">>>>>>>>>>>>>>>配置的url:[{}]不是一个有效的url!", s);
                }
            }
        }
        config.setEntryUrlsList(urls);

        // Exit strategy: how the spider decides to stop, plus an optional count.
        config.setExitWay(mission.getExitWay());
        Long c = mission.getExitWayCount();
        config.setCount(c == null ? 0 : c.intValue());

        if (StringUtils.isNotEmpty(mission.getHeaderStr())) {
            config.setHeader(mission.getHeaderStr());
        }
        if (StringUtils.isNotEmpty(mission.getCookieStr())) {
            config.setCookie(mission.getCookieStr());
        }

        AbstractProcessor processor = new DefalutProcessor(config, missionId);

        // Mark the mission as running before the crawl starts.
        mission.setStatus(SpiderConstants.SPIDER_MISSION_STATUS_RUNNING);
        mission.setStartTime(new Date());
        spiderMissionService.updateSpiderMission(mission);

        // NOTE(review): if execute() throws, the mission stays in RUNNING state forever,
        // blocking all future runs. Consider try/finally with a failure status — TODO
        // confirm whether SpiderConstants defines one.
        CopyOnWriteArrayList<Map<String, Object>> datas = processor.execute(); // run the spider

        // Record completion statistics on the mission row.
        mission.setEndTime(new Date());
        mission.setStatus(SpiderConstants.SPIDER_MISSION_STATUS_DONE);
        mission.setSuccessNum((long) datas.size());
        long costSeconds = (mission.getEndTime().getTime() - mission.getStartTime().getTime()) / 1000;
        mission.setTimeCost(String.valueOf(costSeconds));
        spiderMissionService.updateSpiderMission(mission);

        // Hand the crawled rows to the caller, if a callback was supplied.
        if (callBack != null) {
            Map<String, CopyOnWriteArrayList<Map<String, Object>>> rmap = new HashMap<>();
            rmap.put("datas", datas);
            callBack.setParams(rmap);
            callBack.onSuccess();
        }
    }

    /**
     * Returns true if the given string matches the http(s)/ftp/file URL pattern.
     * Lower-cases the input first so the scheme check is case-insensitive.
     */
    private static boolean isURL(String str) {
        return URL_PATTERN.matcher(str.toLowerCase()).matches();
    }
}