Demo entry 6346869

test

   

Submitted by anonymous on Feb 13, 2017 at 09:08
Language: Java. Code size: 21.1 kB.

package com.jd.sem.brain.business.campaign.impl;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import javax.annotation.Resource;

import org.joda.time.LocalDateTime;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor;

import au.com.bytecode.opencsv.CSVWriter;

import com.jd.sem.brain.business.campaign.GenTask;
import com.jd.sem.brain.business.gen.GenService;
import com.jd.sem.brain.business.gen.impl.GenServiceImpl;
import com.jd.sem.brain.business.http.impl.HttpServiceImpl;
import com.jd.sem.brain.common.model.UrlSet;
import com.jd.sem.brain.common.model.words.GenParam;
import com.jd.sem.brain.common.model.words.Word;
import com.jd.sem.brain.common.model.words.WordsParam;
import com.jd.sem.brain.common.util.cache.map.impl.MemoryCacheImpl;
import com.jd.sem.brain.rpc.jss.JssService;

/***
 * 
 * @author chenlong
 *
 */
public class GenTaskImpl implements GenTask, Callable<String> {

	/**
	 * Matches one parenthesised fragment and captures the text between the parentheses
	 * (group 1). Used by {@link SeedInfo} to split a brand alias off the brand text.
	 * NOTE(review): each character class lists the ASCII parenthesis twice; one of them
	 * may originally have been a full-width parenthesis - confirm against the real file.
	 */
	protected static Pattern PatBrac = Pattern.compile("[\\((]([^\\))]+)[)\\)]");
	/** Matches a run of digits and commas (group 1); used to read the result count from a SERP. */
	protected static Pattern PatNum = Pattern.compile("([0-9,]+)");
	/** Per-instance logger named after the runtime class of this object. */
	protected final Logger LOG = LoggerFactory.getLogger(getClass());
	/**
	 * Domain fragments that analysSerp counts as "Foe" hits (nFoeSerp / nFoeAd) when they
	 * appear in a displayed URL.
	 */
	protected static List<String> RelatedDomains = Arrays.asList("zol.com.cn", "product.yesky.com", "product.pcpop.com",
			"suning.com", "www.gome.com.cn", "www.taobao.com", "www.tmall.com", "amazon.cn", "etao.com", "jd.com",
			"www.manmanbuy.com", "1688.com");
	/** Not referenced anywhere in this file. NOTE(review): may be used by subclasses - confirm. */
	protected static List<String> NegativeDomains = Arrays.asList("eastmoney.com", "10jqka.com.cn", "tieba.baidu.com");
	/** Not referenced anywhere in this file. NOTE(review): presumably a cap on generated words - confirm. */
	protected static int MAX_EXPAND_WORDS = 65536;
	/** Browser User-Agent string sent with every Jsoup request and placed in the HTTP header map. */
	protected static String USER_AGENT = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36";

	/**
	 * Stage the task is currently in. Each constant carries a display title and a
	 * step number that increases in pipeline order (0 = not started, 6 = finished).
	 */
	public enum ExpandStage {
		Initing("未开始", 0),
		Crawling("抓取活动页", 1),
		Expanding("扩词中", 2),
		SERPing("分析中", 3),
		Cleaning("清洗中", 4),
		Outputing("输出中", 5),
		Finished("已结束", 6);

		/** Step number assigned at declaration; never changes. */
		private final int step;
		/** Display title; can be replaced at runtime through {@link #setTitle(String)}. */
		private String title;

		ExpandStage(String title, int step) {
			this.step = step;
			this.title = title;
		}

		/** @return the display title of this stage */
		public String getTitle() {
			return this.title;
		}

		/**
		 * Replaces the display title. Enum constants are shared, so the change is
		 * visible to every user of this constant.
		 */
		public void setTitle(String title) {
			this.title = title;
		}

		/** @return the step number of this stage */
		public int getStep() {
			return this.step;
		}
	}
	/**
	 * Kind of landing page written to the output: an item page, a search page,
	 * the activity page, or a custom page. Each constant carries a display title.
	 */
	public enum LandingType {
		Item("单品页"),
		Search("搜索页"),
		Act("活动页"),
		Custom("指定着陆页");

		/** Display title; can be replaced at runtime through {@link #setTitle(String)}. */
		private String title;

		LandingType(String title) {
			this.title = title;
		}

		/**
		 * Replaces the display title. Enum constants are shared, so the change is
		 * visible to every user of this constant.
		 */
		public void setTitle(String title) {
			this.title = title;
		}

		/** @return the display title of this landing type */
		public String getTitle() {
			return this.title;
		}
	}
	
	/**
	 * Seed-word information taken from one item page: three category levels, the
	 * brand, an optional brand alias and the URL of the page it came from.
	 *
	 * <p>If the raw brand text contains a parenthesised fragment, the text inside the
	 * first such fragment becomes the brand alias and every parenthesised fragment is
	 * removed from the brand itself.
	 */
	public class SeedInfo {
		private final String firstCategory;
		private final String secondCategory;
		private final String thirdCategory;
		private final String brand;
		/** Text inside the first parenthesised fragment of the raw brand, or "" if there is none. */
		private final String brandAlias;
		private final String itemPage;

		/**
		 * @param fc     first-level category
		 * @param sc     second-level category
		 * @param tc     third-level category
		 * @param br     raw brand text, possibly containing a parenthesised alias; must not be null
		 * @param source URL of the item page the data was read from
		 * @throws NullPointerException if {@code br} is null
		 */
		public SeedInfo(String fc, String sc, String tc, String br, String source){
			this.firstCategory = fc;
			this.secondCategory = sc;
			this.thirdCategory = tc;
			this.itemPage = source;

			// One matcher on the precompiled pattern serves both steps; the previous code
			// recompiled an equivalent regex through String.replaceAll on every construction.
			Matcher m = PatBrac.matcher(br);
			this.brandAlias = m.find() ? m.group(1) : "";
			// Matcher.replaceAll resets the matcher first, so all fragments are stripped.
			this.brand = m.replaceAll("");
		}

		public String getFirstCategory() {
			return this.firstCategory;
		}

		public String getSecondCategory() {
			return this.secondCategory;
		}

		public String getThirdCategory() {
			return this.thirdCategory;
		}

		/** @return the brand with all parenthesised fragments removed */
		public String getBrand() {
			return this.brand;
		}

		/** @return URL of the item page this seed was read from */
		public String getItemPage() {
			return this.itemPage;
		}
	}
	
	/**
	 * Counts extracted from one search result page for a keyword, plus two scores
	 * derived from them at construction time.
	 *
	 * <ul>
	 * <li>relatedScore is 0 when either nFoeSerp or nRedSerp is 0; otherwise it is
	 *     (nRedSerp + nFoeSerp) / 10 + 0.4 * nFoeAd.</li>
	 * <li>competiveScore is nAd + 2 * nFoeAd.</li>
	 * </ul>
	 */
	public static class SerpResult {
		private final String keyword;
		private final int nResult;
		private final int nAd;
		private final int nFoeAd;
		private final int nRedSerp;
		private final int nFoeSerp;
		private final float relatedScore;
		private final float competiveScore;

		/**
		 * @param keyword  the keyword that was searched
		 * @param nResult  result count reported by the page
		 * @param nAd      number of non-organic blocks
		 * @param nFoeAd   number of ad entries showing a related domain
		 * @param nRedSerp number of organic results whose title highlights the whole keyword
		 * @param nFoeSerp number of organic results showing a related domain
		 */
		public SerpResult(String keyword, int nResult, int nAd, int nFoeAd, int nRedSerp, int nFoeSerp) {
			this.keyword = keyword;
			this.nResult = nResult;
			this.nAd = nAd;
			this.nFoeAd = nFoeAd;
			this.nRedSerp = nRedSerp;
			this.nFoeSerp = nFoeSerp;

			boolean hasBothSignals = nFoeSerp != 0 && nRedSerp != 0;
			this.relatedScore = hasBothSignals
					? (float) ((nRedSerp + nFoeSerp) / 10.0 + 0.4 * nFoeAd)
					: 0.0f;
			this.competiveScore = (float) (nAd + nFoeAd * 2);
		}

		public String getKeyword() {
			return keyword;
		}

		public int getNResult() {
			return nResult;
		}

		public float getRelatedScore() {
			return relatedScore;
		}

		public float getCompetiveScore() {
			return competiveScore;
		}
	}

	/**
	 * A generated keyword together with its SERP scores, a validity flag and the
	 * category / brand / item-page attributes copied from the seed it came from.
	 * A new instance starts with both scores at 0 and {@code valid == false}.
	 */
	public class ExWord {
		private final Word word;
		private float relatedScore = 0.0f;
		private float competiveScore = 0.0f;
		private boolean valid;
		private String firstCategory;
		private String secondCategory;
		private String thirdCategory;
		private String brand;
		private String itemPage;

		/** @param w the generated word this entry wraps */
		public ExWord(Word w) {
			this.word = w;
		}

		/** @return the wrapped generated word */
		public Word getWord() {
			return this.word;
		}

		// --- scores -------------------------------------------------------

		public float getRelatedScore() {
			return this.relatedScore;
		}

		public void setRelatedScore(float relatedScore) {
			this.relatedScore = relatedScore;
		}

		public float getCompetiveScore() {
			return this.competiveScore;
		}

		public void setCompetiveScore(float competiveScore) {
			this.competiveScore = competiveScore;
		}

		// --- validity flag ------------------------------------------------

		public boolean isValid() {
			return this.valid;
		}

		public void setValid(boolean valid) {
			this.valid = valid;
		}

		// --- attributes copied from the seed ------------------------------

		public String getFirstCategory() {
			return this.firstCategory;
		}

		public void setFirstCategory(String firstCategory) {
			this.firstCategory = firstCategory;
		}

		public String getSecondCategory() {
			return this.secondCategory;
		}

		public void setSecondCategory(String secondCategory) {
			this.secondCategory = secondCategory;
		}

		public String getThirdCategory() {
			return this.thirdCategory;
		}

		public void setThirdCategory(String thirdCategory) {
			this.thirdCategory = thirdCategory;
		}

		public String getBrand() {
			return this.brand;
		}

		public void setBrand(String brand) {
			this.brand = brand;
		}

		public String getItemPage() {
			return this.itemPage;
		}

		public void setItemPage(String itemPage) {
			this.itemPage = itemPage;
		}
	}

	/** Seed words obtained from item pages, keyed by the seed text. */
	private Map<String, SeedInfo> activeSeeds;
	
	/** Current stage of the task. */
	private ExpandStage stage;

	/** Time at which run() started. */
	private LocalDateTime taskStartTime;
	/** Time recorded by run() when it finished. */
	private LocalDateTime taskFinishedTime;
	/** Outcome flag; initialised to true and set to false by call() when run() throws. */
	private boolean taskSuccess;
	/** String form of the exception that made the task fail, or "" if none. */
	private String taskFailedReason;
	/** Value returned by the upload of the result file. */
	private String taskResultFile;
	
	/** Landing page type. */
	private LandingType landingType;
	
	/** Limit on the number of items to crawl. */
	private int itemLimit;

	/** Forged HTTP headers for crawling. Built in the constructor; not read elsewhere in this file. */
	private Map<String, String> httpHeader;
	
	/** The specified activity page. */
	private URI activePage;
	
	/** Word-expansion results. */
	private List<ExWord> results;
	
	/** Local directory for temporary files. */
	private String localTmpWorkDir;
	
	/** Executor used to fetch search result pages in parallel. */
	private ThreadPoolTaskExecutor threadPoolTaskExecutor;

	/** HTTP page download tool. */
	HttpServiceImpl httpService;

	/** Crawling / word-expansion tool. */
	GenServiceImpl genService;
	
	/** Service used to upload the result file. */
	JssService jssService;
	
	/** Word-expansion parameters. */
	WordsParam wordsParam;
	
	/** Word-expansion behaviour parameters. */
	GenParam genParam;
	
	/** Lower score bound for newly expanded words. */
	float scoreThreshold;
	
	/**
	 * Creates a task for one activity page.
	 *
	 * <p>Side effect on the argument: the UrlSet of {@code genService} is replaced with a
	 * fresh instance. The GenWords list of {@code genService} is not configured here.
	 *
	 * @param genService     word-expansion service to use; must not be null
	 * @param activityUrl    URL of the activity page to crawl
	 * @param landingType    which landing page is written to the output
	 * @param itemLimit      limit on the number of item links collected from the activity page
	 * @param scoreThreshold results whose related score is not above this value are discarded by clean()
	 * @throws NullPointerException if {@code genService} is null
	 */
	GenTaskImpl(GenServiceImpl genService, URI activityUrl,
			LandingType landingType, int itemLimit, float scoreThreshold){
		// Fail fast with a clear message instead of an anonymous NPE further down.
		if (genService == null)
			throw new NullPointerException("genService must not be null");

		// Fields are assigned directly: calling the public, overridable setters from a
		// constructor would run subclass code before the subclass is initialised.
		this.stage = ExpandStage.Initing;
		this.activePage = activityUrl;
		this.activeSeeds = new HashMap<String, SeedInfo>();

		MemoryCacheImpl memoryCache = new MemoryCacheImpl(100);
		this.httpService = new HttpServiceImpl();
		this.httpService.setMemoryCache(memoryCache);

		this.genService = genService;
		this.itemLimit = itemLimit;
		this.landingType = landingType;
		this.scoreThreshold = scoreThreshold;

		this.genService.setUrlSet(new UrlSet());

		// Initial task result: no words yet, optimistic success flag, no failure reason.
		this.results = new ArrayList<ExWord>();
		this.taskSuccess = true;
		this.taskFailedReason = "";

		// Word-expansion parameters.
		this.wordsParam = new WordsParam();
		this.wordsParam.setEnableBlackList(true);
		this.wordsParam.setLevel(1);
		this.wordsParam.setEnableDupFilter(true);
		this.wordsParam.setSourceList(Arrays.asList("百度API拓词",
			"百度下拉相关词"));
		this.genParam = new GenParam();

		// Browser-like HTTP headers.
		this.httpHeader = new HashMap<String, String>();
		this.httpHeader.put("User-Agent", USER_AGENT);
		this.httpHeader.put("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
		this.httpHeader.put("Accept-Encoding", "gzip,deflate,sdch");
		this.httpHeader.put("Accept-Language", "zh-CN,zh;q=0.8");
	}
	
	/**
	 * Reports whether {@code url} contains any of the given domain fragments.
	 * Despite the name, the result is a 0/1 flag and not a count.
	 *
	 * @param url     text to inspect (plain substring matching, no URL parsing)
	 * @param domains domain fragments to look for
	 * @return 1 if at least one entry of {@code domains} is a substring of {@code url}, otherwise 0
	 */
	public static int countDomain(String url, List<String> domains){
		int hit = 0;
		// Stop scanning as soon as the first fragment matches.
		for (int i = 0; i < domains.size() && hit == 0; i++) {
			if (url.contains(domains.get(i)))
				hit = 1;
		}
		return hit;
	}

	/**
	 * Analyses one search engine result page (SERP) for a keyword.
	 *
	 * <p>Every direct child div of {@code div#content_left} is classified:
	 * <ul>
	 * <li>class contains "result": an organic result. It counts towards nRedSerp when the
	 *     keyword becomes empty after removing every highlighted ({@code em}) title
	 *     fragment, and towards nFoeSerp when its displayed URL contains one of
	 *     {@code RelatedDomains}.</li>
	 * <li>anything else counts as one ad block (nAd). If the block has an {@code id} or
	 *     {@code data-pos} attribute, each {@code a span} text inside it that contains a
	 *     related domain counts towards nFoeAd.</li>
	 * </ul>
	 *
	 * @param serp    parsed result page
	 * @param keyword the keyword that was searched
	 * @return the counts and derived scores for this page
	 */
	public static SerpResult analysSerp(Document serp, String keyword) {
		int nResult = parseResultCount(serp.select("div.head_nums_cont_inner div.nums").text());

		int nAd = 0, nFoeAd = 0, nRedSerp = 0, nFoeSerp = 0;
		for (Element element:serp.select("div#content_left > div")) {
			if (element.attr("class").contains("result"))
			{
				// Strip every highlighted title fragment; nothing left means the whole
				// keyword was highlighted.
				String stripKeyword = keyword;
				for (Element em:element.select("h3 > a > em")) {
					stripKeyword = stripKeyword.replace(em.text(), "");
				}
				if (stripKeyword.length() < 1)
					nRedSerp++;
				String showUrl = element.select("div a.c-showurl").text();
				if (countDomain(showUrl, RelatedDomains) > 0)
					nFoeSerp++;
			} else {
				/*
				 * "pp" ads are marked by an id of the form x00x.
				 * "ppim" and "im" ads are marked by data-pos=12 or 13 and contain
				 * several ads whose ids have the form x00x.
				 */
				if (element.hasAttr("id") || element.hasAttr("data-pos"))
				{
					for (Element span:element.select("a span")) {
						String showUrl = span.text();
						if (countDomain(showUrl, RelatedDomains) > 0)
							nFoeAd++;
					}
				}
				// NOTE(review): counted for every non-result block, including blocks
				// without id/data-pos - confirm this is intended.
				nAd++;
			}
		}
		
		return new SerpResult(keyword, nResult, nAd, nFoeAd, nRedSerp, nFoeSerp);
	}

	/**
	 * Extracts the first number (digits with optional thousands commas) from the
	 * result-count text. Returns 0 when there is no number and Integer.MAX_VALUE when
	 * the number does not fit into an int.
	 */
	private static int parseResultCount(String text) {
		Matcher matcher = PatNum.matcher(text);
		// find() must run before group(); without it group() always throws
		// IllegalStateException, which previously forced the count to 0 every time.
		if (!matcher.find())
			return 0;
		String digits = matcher.group(1).replace(",", "");
		if (digits.isEmpty())
			return 0; // the match consisted of commas only
		try {
			long count = Long.parseLong(digits);
			return count > Integer.MAX_VALUE ? Integer.MAX_VALUE : (int) count;
		} catch (NumberFormatException e) {
			// Only digits remain at this point, so this means "too large even for a long".
			return Integer.MAX_VALUE;
		}
	}

	/**
	 * Crawls the activity page and derives seed words from the item pages it links to.
	 *
	 * <p>Steps: download {@code activePage}; collect distinct links whose href contains
	 * "item.jd.com", stopping once {@code itemLimit} links were gathered; download each
	 * item page and read its breadcrumb; register a {@link SeedInfo} in
	 * {@code activeSeeds} under the last breadcrumb text, under brand + third category
	 * and, when the brand has an alias, under alias + third category.
	 *
	 * <p>Best effort: pages that cannot be downloaded or whose breadcrumb cannot be
	 * parsed are logged and skipped; the method itself does not fail because of them.
	 */
	public void crawlPage() {
		this.setStage(ExpandStage.Crawling);
		Pattern patHref = Pattern.compile("href=\"([^\"]+)\"");
		String document = "";
		try {
			document = Jsoup.connect(this.activePage.toString())
					.userAgent(USER_AGENT).get().toString();
		} catch (IOException e) {
			// Continue with an empty document: no links, hence no seeds.
			LOG.error("Failed to fetch activity page: " + this.activePage, e);
		}
		Matcher matcher = patHref.matcher(document);
		Set<URI> itemUrls = new HashSet<URI>();
		while (matcher.find())
		{
			String href = matcher.group(1);
			if (! href.contains("item.jd.com"))
				continue;
			try {
				// Resolve relative and protocol-relative hrefs against the activity page.
				itemUrls.add(this.activePage.resolve(href));
			} catch (Exception e) {
				LOG.error("href resolve error:", e);
			}
			if (itemUrls.size() >= this.itemLimit)
				break;
		}

		for (URI itemUrl:itemUrls) {
			Document doc;
			try {
				doc = Jsoup.connect(itemUrl.toString())
						.userAgent(USER_AGENT).get();
			} catch (IOException e) {
				LOG.warn("Failed to fetch item page: " + itemUrl, e);
				continue;
			} 
			// Two breadcrumb layouts are tried; the second is the fallback.
			Elements elements = doc.select("div.breadcrumb a");
			if (elements.size() < 1) {
				elements = doc.select("div.crumb.fl.clearfix a[clstag~=.*shangpin*]");
			}
			if (elements.size() != 4 && elements.size() != 5) {
				LOG.info("Item url: {} can't parse breadcrumb", itemUrl.toString());
				continue;
			}
			List<String> paths = new ArrayList<String>();
			for (Element element:elements){
				String text = element.text().toLowerCase();
				if (text.length() > 0)
					paths.add(text);
			}
			// Links with empty text are dropped above, so fewer than four entries may
			// remain; indexing them blindly used to abort the whole task.
			if (paths.size() < 4) {
				LOG.info("Item url: {} can't parse breadcrumb", itemUrl.toString());
				continue;
			}
			// Breadcrumb order: first, second, third category, then brand.
			SeedInfo seedInfo = new SeedInfo(paths.get(0), paths.get(1), paths.get(2), 
					paths.get(3), itemUrl.toString());
			this.activeSeeds.put(paths.get(paths.size() - 1), seedInfo);
			this.activeSeeds.put(seedInfo.brand + seedInfo.thirdCategory, seedInfo);
			if (seedInfo.brandAlias.length() > 0)
				this.activeSeeds.put(seedInfo.brandAlias + seedInfo.thirdCategory, seedInfo);
		}
	}

	/**
	 * Expands the crawled seeds into candidate keywords.
	 *
	 * <p>Every key of {@code activeSeeds} is handed to {@code genService} as a related
	 * word (all category ids set to 0). Each distinct generated word whose related word
	 * maps back to a known seed and whose text is at most 14 characters long is added to
	 * {@code results}, marked valid and tagged with the seed's attributes. Nothing is
	 * generated when there are no seeds.
	 */
	public void expand() {
		this.setStage(ExpandStage.Expanding);

		List<Word> seedWords = new ArrayList<Word>();
		for (String seedKey : this.activeSeeds.keySet())
			seedWords.add(new Word(seedKey));

		this.genParam.setRelatedWords(seedWords);
		this.genParam.setFirstCateId(0L);
		this.genParam.setSecondCateId(0L);
		this.genParam.setThirdCateId(0L);

		if (seedWords.isEmpty())
			return;

		Set<String> seenTexts = new HashSet<String>();
		for (Word candidate : this.genService.genWords(this.wordsParam, this.genParam)) {
			// add() returns false for a text that was already seen.
			if (!seenTexts.add(candidate.getWord()))
				continue;

			SeedInfo origin = this.activeSeeds.get(candidate.getRelatedWord());
			if (origin == null)
				continue;
			if (candidate.getWord().length() > 14)
				continue;

			ExWord expanded = new ExWord(candidate);
			expanded.setFirstCategory(origin.getFirstCategory());
			expanded.setSecondCategory(origin.getSecondCategory());
			expanded.setThirdCategory(origin.getThirdCategory());
			expanded.setBrand(origin.getBrand());
			expanded.setItemPage(origin.getItemPage());
			expanded.setValid(true);
			this.results.add(expanded);
		}
	}

	/**
	 * Fetches the search result page of every expanded word and stores the resulting
	 * related / competitive scores on the word.
	 *
	 * <p>Fetches are submitted to {@code threadPoolTaskExecutor} and collected in batches
	 * so that at most 11 requests are outstanding at a time. The final, possibly partial,
	 * batch is collected after the loop. Words whose page cannot be fetched keep their
	 * initial scores.
	 *
	 * <p>If the calling thread is interrupted, the interrupt flag is restored and no
	 * further pages are requested.
	 */
	public void search() {
		this.setStage(ExpandStage.SERPing);
		UrlSet urlSet = new UrlSet();
		List<Future<SerpResult>> pendingFutures = new ArrayList<Future<SerpResult>>();
		List<ExWord> pendingWords = new ArrayList<ExWord>();
		for (final ExWord result:this.results) {
			if (Thread.currentThread().isInterrupted())
				break;
			String param;
			try {
				param = URLEncoder.encode(result.getWord().getWord(), "UTF-8");
			} catch (UnsupportedEncodingException e) {
				LOG.error("GenTaskImpl: {} can't urlencode, error:{}", result.getWord(), e.toString());
				continue;
			}
			final String url = String.format(urlSet.getBaiduSearchUrl(), param);
			
			Future<SerpResult> future = this.threadPoolTaskExecutor
					.submit(new Callable<SerpResult>(){
						@Override
						public SerpResult call() {
							Document doc = null;
							try {
								// jsoup timeouts are in milliseconds.
								// NOTE(review): 300 ms is very tight for a remote page - confirm.
								doc = Jsoup.connect(url).userAgent(USER_AGENT).timeout(300).get();
							} catch (IOException e) {
								// null tells the collector to leave this word's scores untouched.
								return null;
							}
							return GenTaskImpl.analysSerp(doc, result.getWord().getWord());
						}
					});
			// The two lists are kept index-aligned: future i belongs to word i.
			pendingFutures.add(future);
			pendingWords.add(result);
			if (pendingFutures.size() > 10)
				applySerpScores(pendingFutures, pendingWords);
		}
		// Collect the trailing batch; without this the last (up to 10) words never
		// received scores and were later dropped by clean().
		applySerpScores(pendingFutures, pendingWords);
	}

	/**
	 * Waits for each future, copies its scores onto the word at the same index, then
	 * empties both lists. Stops waiting (and cancels what is left) on interruption.
	 */
	private void applySerpScores(List<Future<SerpResult>> futures, List<ExWord> words) {
		for (int i = 0; i < futures.size(); i++) {
			SerpResult sr;
			try {
				sr = futures.get(i).get();
			} catch (InterruptedException e) {
				// Restore the flag for the caller and give up on the rest of the batch.
				Thread.currentThread().interrupt();
				for (int j = i; j < futures.size(); j++)
					futures.get(j).cancel(true);
				break;
			} catch (ExecutionException e) {
				LOG.warn("SERP analysis failed for " + words.get(i).getWord(), e);
				continue;
			}
			if (sr == null)
				continue;
			ExWord exWord = words.get(i);
			exWord.setRelatedScore(sr.getRelatedScore());
			exWord.setCompetiveScore(sr.getCompetiveScore());
		}
		futures.clear();
		words.clear();
	}

	/**
	 * Data cleaning: marks every result whose related score is less than or equal to
	 * {@code scoreThreshold} as invalid. Results above the threshold keep their flag.
	 */
	public void clean() {
		this.setStage(ExpandStage.Cleaning);
		for (int i = 0; i < this.results.size(); i++) {
			ExWord candidate = this.results.get(i);
			float score = candidate.getRelatedScore();
			if (score <= this.scoreThreshold) {
				// Relevance too low: give up on this word.
				candidate.setValid(false);
			}
		}
	}

	/**
	 * Assembles the output: writes every valid result to a GBK-encoded CSV file in
	 * {@code localTmpWorkDir}, uploads the file through {@code jssService}, stores the
	 * value returned by the upload as the task result file and moves the task to the
	 * Finished stage.
	 *
	 * <p>The landing page column holds the activity page for {@code LandingType.Act},
	 * the seed's item page for {@code LandingType.Item}, and the word's own URL otherwise.
	 *
	 * <p>The local file is left in place after the upload.
	 *
	 * @throws IllegalStateException if {@code localTmpWorkDir} has not been set
	 * @throws Exception if the file cannot be created or written, or is propagated from the upload
	 */
	public void output() throws Exception {
		this.setStage(ExpandStage.Outputing);
		if (this.localTmpWorkDir == null)
			throw new IllegalStateException("localTmpWorkDir is not set");
		String fingerPrint = Long.toString(System.currentTimeMillis()) + "_" +
				Integer.toString((int)(Math.random() * 10000)) + ".csv";
		Path tmpFilePath = Paths.get(this.localTmpWorkDir, fingerPrint);
		BufferedWriter bWriter = 
			new BufferedWriter(
				new OutputStreamWriter(
					new FileOutputStream(tmpFilePath.toString()), "GBK"));
		CSVWriter csvWriter = new CSVWriter(bWriter);
		try {
			csvWriter.writeNext(new String[]{
				"种子词", "一级类目", "二级类目", "三级类目", "品牌",
				"新关键词", "出价", "匹配模式", "着陆页", "相关系数"
			});
			for (ExWord result:this.results) {
				if (! result.isValid())
					continue;
				Word tmpWord = result.getWord();
				String landingPage = tmpWord.getUrl();
				if (this.landingType == LandingType.Act)
					landingPage = this.activePage.toString();
				else if (this.landingType == LandingType.Item)
					landingPage = result.getItemPage();
				csvWriter.writeNext(new String[]{ tmpWord.getRelatedWord(),
						result.getFirstCategory(), result.getSecondCategory(),
						result.getThirdCategory(), result.getBrand(),
						tmpWord.getWord(), Double.toString(tmpWord.getPrice()),
						tmpWord.getMatchName(), landingPage, Float.toString(result.getRelatedScore())
				});
			}
		} finally {
			// Closing before the upload flushes everything to disk (previously the header
			// stayed in the buffer when there were no rows) and releases the file handle,
			// which used to leak.
			csvWriter.close();
		}
		
		File tmpFile = tmpFilePath.toFile();
		this.setTaskResultFile(jssService.upload(fingerPrint, 
				tmpFile));
		this.setStage(ExpandStage.Finished);
	}
	
	/**
	 * Runs the whole pipeline in order: crawl, expand, search, clean, output.
	 * The start time is recorded first; the finish time is recorded when the pipeline
	 * ends, whether it completed or a stage threw.
	 *
	 * @throws Exception whatever a stage throws; the remaining stages are skipped
	 */
	public void run() throws Exception{
		this.taskStartTime = new LocalDateTime();
		try {
			crawlPage();
			expand();
			search();
			clean();
			output();
		} finally {
			// Record the end time for failed runs too; it used to stay unset on failure.
			this.taskFinishedTime = new LocalDateTime();
		}
	}

	/**
	 * Executes {@link #run()} and records the outcome in the success flag and the
	 * failure reason.
	 *
	 * @return {@code null} on success, otherwise the string form of the exception that
	 *         aborted the task
	 */
	@Override
	public String call() {
		try {
			run();
		} catch (Exception e) {
			this.taskSuccess = false;
			this.taskFailedReason = e.toString();
			// Go through the class logger so the stack trace reaches the configured log
			// output instead of stderr.
			LOG.error("GenTask failed at stage " + this.stage, e);
			return e.toString();
		}
		this.taskSuccess = true;
		this.taskFailedReason = "";
		return null;
	}

	/** Returns the stage the task is currently in. */
	public ExpandStage getStage() {
		return stage;
	}

	/** Sets the current stage; each pipeline step calls this when it starts. */
	public void setStage(ExpandStage stage) {
		this.stage = stage;
	}
	
	/** Sets the activity page to crawl. */
	public void setActivePage(URI activePage) {
		this.activePage = activePage;
	}
	
	/**
	 * Returns the success flag. It is initialised to true in the constructor, so it
	 * also reads true before the task has run; call() sets it to false on failure.
	 */
	public boolean getTaskSuccess() {
		return this.taskSuccess;
	}
	
	/** Returns the string form of the exception that failed the task, or "" if there is none. */
	public String getTaskFailedReason() {
		return this.taskFailedReason;
	}
	
	/** Returns the internal seed map itself (not a copy); changes made by the caller affect this task. */
	public Map<String, SeedInfo> getSeeds() {
		return this.activeSeeds;
	}
	
	/** Returns the number of seed keys currently registered. */
	public int getSeedsNum() {
		return this.activeSeeds.size();
	}
	
	/** Returns how many of the expanded words are currently marked valid. */
	public int getResultNum() {
		int validCount = 0;
		for (int i = 0; i < this.results.size(); i++) {
			if (this.results.get(i).isValid())
				validCount += 1;
		}
		return validCount;
	}
	
	/** Returns the time at which run() started, or null if it has not been started. */
	public LocalDateTime getStartTime() {
		return this.taskStartTime;
	}
	
	/** Returns the time recorded by run() when it finished, or null if that has not happened yet. */
	public LocalDateTime getFinishedTime() {
		return this.taskFinishedTime;
	}
	
	/** Stores the location of the result file; output() passes the value returned by the upload. */
	public void setTaskResultFile(String filePath) {
		this.taskResultFile = filePath;
	}

	/** Returns the stored result file location, or null if none has been set. */
	public String getTaskResultFile() {
		return this.taskResultFile;
	}
	
	/**
	 * Entry point that currently does nothing. The commented-out body is a disabled
	 * manual smoke test; it uses a no-argument constructor that this class does not
	 * declare, so it would not compile if simply re-enabled.
	 */
	public static void main(String[] args) throws URISyntaxException, IOException {
		/*
		GenTaskImpl gti = new GenTaskImpl();
		URI uri = new URI("https://sale.jd.com/act/RHi8BPAsgUpF4N6.html");
		gti.setActivePage(uri);
		gti.crawlPage();
		gti.expand();
		gti.search();
		*/
	}

	/** Returns the local directory in which output() creates the CSV file. */
	public String getLocalTmpWorkDir() {
		return localTmpWorkDir;
	}

	/** Sets the local directory in which output() creates the CSV file. */
	public void setLocalTmpWorkDir(String localTmpWorkDir) {
		this.localTmpWorkDir = localTmpWorkDir;
	}

	/** Returns the executor used by search(). */
	public ThreadPoolTaskExecutor getThreadPoolTaskExecutor() {
		return threadPoolTaskExecutor;
	}

	/** Sets the executor used by search() and hands the same executor to genService. */
	public void setThreadPoolTaskExecutor(ThreadPoolTaskExecutor threadPoolTaskExecutor) {
		this.threadPoolTaskExecutor = threadPoolTaskExecutor;
		this.genService.setThreadPoolTaskExecutor(this.threadPoolTaskExecutor);
	}
	
	/** Sets the service that output() uses to upload the result file. */
	public void setJssService(JssService jssService) {
		this.jssService = jssService;
	}
}

This snippet took 0.04 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).