具体的方案参见博客
代码实现如下:
maven依赖:
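<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.4</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>commons-lang</groupId>
        <artifactId>commons-lang</artifactId>
        <version>2.6</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.common</groupId>
        <artifactId>commons-codec</artifactId>
        <version>1.2</version>
    </dependency>
    <dependency>
        <groupId>net.sourceforge.nekohtml</groupId>
        <artifactId>nekohtml</artifactId>
        <version>1.9.10</version>
    </dependency>
    <dependency>
        <groupId>dom4j</groupId>
        <artifactId>dom4j</artifactId>
        <version>1.6.1</version>
    </dependency>
    <dependency>
        <groupId>jaxen</groupId>
        <artifactId>jaxen</artifactId>
        <version>1.1.4</version>
    </dependency>
</dependencies>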
公共类:
import java.util.HashSet;import java.util.Set;import org.apache.commons.codec.digest.DigestUtils;/** * 训练集封装的value object * @date 2013-10-21 */public class ItemTrainVo { private ItemTrainVo() { super(); } /**更新实例的same number同时加入text对应的md5值 * @param insance ItemTrainVo实例 * @param xpath xpath * @param text 解析dom树节点对应的文本值 * @date 2013-10-21 */ public static void updateInstance(ItemTrainVo insance, String xpath, String text) { insance.setXpath(xpath); String md5Text = DigestUtils.md5Hex(text); if (insance.getMd5Texts().contains(md5Text)) { insance.setSameNum(insance.getSameNum() + 1L); } insance.getMd5Texts().add(md5Text); } /**创建一个空的实例 * @param xpath xpath * @param text 解析dom树节点对应的文本值 * @return 返回创建的实例 * @date 2013-10-21 */ public static ItemTrainVo getInstance(String xpath, String text) { ItemTrainVo insance = new ItemTrainVo(); insance.setXpath(xpath); String md5Text = DigestUtils.md5Hex(text); if (insance.getMd5Texts().contains(md5Text)) { insance.setSameNum(insance.getSameNum() + 1L); } insance.getMd5Texts().add(md5Text); return insance; } private String xpath; private Setmd5Texts = new HashSet (); private long sameNum = 0L; public String getXpath() { return xpath; } public void setXpath(String xpath) { this.xpath = xpath; } public Set getMd5Texts() { return md5Texts; } public void setMd5Texts(Set md5Texts) { this.md5Texts = md5Texts; } public long getSameNum() { return sameNum; } public void setSameNum(long sameNum) { this.sameNum = sameNum; } @Override public String toString() { return xpath + "=>" + md5Texts + ""; }}
抽象训练器(以后如果需要实现基于 CSS path 的训练器,可以继承此类):
import java.math.BigDecimal;import java.util.HashSet;import java.util.Iterator;import java.util.Map;import java.util.Set;import java.util.Map.Entry;import com.panguso.techrd.parser.trainer.vo.ItemTrainVo;public abstract class BaseTrainer { /** * 基于概率阀值清理训练结果,保留符合条件的结果,返回最终的标签相对模板集合 * * @param map 训练中间数据传输对象map * @param q 概率阀值 * @throws Exception Exception * @date 2013-10-24 */ protected SetcleanResult(Map map, double q) throws Exception { Iterator > iterator = map.entrySet().iterator(); Set xpaths = new HashSet (); while (iterator.hasNext()) { Entry entry = iterator.next(); ItemTrainVo itv = entry.getValue(); BigDecimal dif = new BigDecimal(itv.getMd5Texts().size()); BigDecimal all = new BigDecimal(itv.getMd5Texts().size() + itv.getSameNum()); BigDecimal result = dif.divide(all, 3, BigDecimal.ROUND_HALF_DOWN); if (result.doubleValue() < q) { iterator.remove(); } else { xpaths.add(itv.getXpath()); } } return xpaths; } /** * 基于训练集提取其所有的text和需要处理的标签对应的path(css * path或者xpath基于具体的实现)作为key,value为map,其key为标签路径 * ,value为封装结构体,其中包括了相对标签path,所有的text值,以及出现相同次数,依此作为概率计算基数。 * * @param path path为训练集的根目录 * @param map 空的map * @throws Exception Exception * @date 2013-10-24 */ protected abstract void parseInfo(String path, Map map) throws Exception; /** * 对外暴漏的方法,提取最终的训练集合,map的key为标签path,value为其例子 * * @param fileDirPath 训练集根路径 * @return 训练结果 * @date 2013-10-24 */ public abstract Map train(String fileDirPath); /** * 获取除了text之外要提取的标签比如 * * @return 返回需要提取的标签列表 * @date 2013-10-24 */ protected abstract Set getUsefullTags(); /** * 获取需要过滤的标签比如 ,训练器当碰到之后就会忽略下层迭代处理 * * @return 返回要过滤的标签列表 * @date 2013-10-24 */ protected abstract Set getUnUsefullTags();}
基于xpath训练器的实现:
import java.io.File;import java.io.FileInputStream;import java.math.BigDecimal;import java.util.Collections;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.Set;import org.apache.commons.lang.StringUtils;import org.cyberneko.html.parsers.DOMParser;import org.dom4j.Attribute;import org.dom4j.Document;import org.dom4j.Element;import org.dom4j.io.DOMReader;import org.xml.sax.InputSource;import com.panguso.techrd.parser.trainer.vo.ItemTrainVo;/** * 基于概率的xpath训练器实现 * * @date 2013-10-24 */public class XPathTrainer extends BaseTrainer { private SetusefullTags = Collections.emptySet(); private Set unUsefullTags = Collections.emptySet(); public XPathTrainer(Set usefullTags, Set unUsefullTags) { super(); this.unUsefullTags = unUsefullTags; this.usefullTags = usefullTags; } @Override protected Set getUnUsefullTags() { return unUsefullTags; } @Override protected Set getUsefullTags() { return usefullTags; } @Override public Map train(String fileDirPath) { Map map = new HashMap (); Map result = new HashMap (); try { parseInfo(fileDirPath, map); Set xpaths = cleanResult(map, computQ(fileDirPath, true)); getResult(fileDirPath, result, xpaths); } catch (Exception e) { e.printStackTrace(); } return result; } private void getResult(String path, Map result, Set xpaths) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); Iterator xpathIterator = xpaths.iterator(); while (xpathIterator.hasNext()) { String xpath = xpathIterator.next(); Element node = (Element) root.selectSingleNode(xpath); result.put(xpath, node.asXML()); } return; } } } private double computQ(String path, boolean defaultVal) { if 
(defaultVal) { return 1.0d; } File file = new File(path); int num = 0; File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { num++; } } BigDecimal dif = new BigDecimal(num - 1); BigDecimal all = new BigDecimal(num); BigDecimal result = dif.divide(all, 3, BigDecimal.ROUND_HALF_DOWN); return result.doubleValue(); } @Override protected void parseInfo(String path, Map map) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isDirectory()) { continue; } DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); dom2PathMap(root, map); } } @SuppressWarnings("unchecked") private void dom2PathMap(Element root, Map map) { if (this.getUnUsefullTags().contains(root.getName())) { return; } if (root == null || root.isTextOnly() || this.getUsefullTags().contains(root.getName())) { String text = root.getText(); String uniqXpath = root.getUniquePath(); String xpath = root.getPath(); if (StringUtils.isEmpty(text)) { Iterator iterator = root.attributeIterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); text = attr.getName() + "." + attr.getValue() + text; } } if (map.containsKey(uniqXpath)) { ItemTrainVo.updateInstance(map.get(uniqXpath), xpath, text); } else { map.put(uniqXpath, ItemTrainVo.getInstance(xpath, text)); } return; } Iterator iterator = root.elementIterator(); while (iterator.hasNext()) { Element el = iterator.next(); dom2PathMap(el, map); } }}
测试代码:
import java.util.HashSet;import java.util.Set;import org.junit.Test;import com.panguso.techrd.parser.trainer.service.XPathTrainer;public class XPathTrainerTest { @Test public final void testTrain() { SetusefullTags = new HashSet (); Set unUsefullTags = new HashSet (); usefullTags.add("IMG"); unUsefullTags.add("HEAD"); unUsefullTags.add("SCRIPT"); String path = "/dom/163/shehui"; XPathTrainer xt = new XPathTrainer(usefullTags, unUsefullTags); System.out.println(xt.train(path)); }}
测试集如附件: