博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
基于概率的网页正文页抽取代码实现
阅读量:4111 次
发布时间:2019-05-25

本文共 8456 字,大约阅读时间需要 28 分钟。

具体的方案参见博客

代码实现如下:

maven依赖:

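<dependencies>
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.4</version>
    <scope>test</scope>
  </dependency>
  <dependency>
    <groupId>commons-lang</groupId>
    <artifactId>commons-lang</artifactId>
    <version>2.6</version>
    <scope>compile</scope>
  </dependency>
  <!-- NOTE: Maven Central 上该构件的 groupId 通常为 commons-codec,请按实际使用的仓库确认 -->
  <dependency>
    <groupId>org.apache.common</groupId>
    <artifactId>commons-codec</artifactId>
    <version>1.2</version>
  </dependency>
  <dependency>
    <groupId>net.sourceforge.nekohtml</groupId>
    <artifactId>nekohtml</artifactId>
    <version>1.9.10</version>
  </dependency>
  <dependency>
    <groupId>dom4j</groupId>
    <artifactId>dom4j</artifactId>
    <version>1.6.1</version>
  </dependency>
  <dependency>
    <groupId>jaxen</groupId>
    <artifactId>jaxen</artifactId>
    <version>1.1.4</version>
  </dependency>
</dependencies>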

 

公共类:

import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Set;

import org.apache.commons.codec.digest.DigestUtils;

/**
 * Value object holding the training statistics collected for one DOM path.
 *
 * <p>For every text recorded against the path it keeps the MD5 fingerprint of that text, and it
 * counts how many times a text arrived whose fingerprint had already been recorded.
 *
 * @date 2013-10-21
 */
public class ItemTrainVo {

	/**
	 * Charset used to turn text into bytes before hashing. Fixed explicitly so that fingerprints
	 * do not depend on the platform default charset (which may be unable to represent the text
	 * and would then make different texts hash identically).
	 */
	private static final Charset UTF_8 = Charset.forName("UTF-8");

	private String xpath;
	private Set<String> md5Texts = new HashSet<String>();
	private long sameNum = 0L;

	private ItemTrainVo() {
		super();
	}

	/**
	 * Records one more text on an existing instance: sets the xpath, increments the "same"
	 * counter when the text's fingerprint is already known, and stores the fingerprint.
	 *
	 * @param insance the instance to update
	 * @param xpath xpath to store on the instance
	 * @param text text value of the parsed DOM node; {@code null} is treated as empty text
	 * @date 2013-10-21
	 */
	public static void updateInstance(ItemTrainVo insance, String xpath, String text) {
		insance.setXpath(xpath);
		// Set.add returns false when the fingerprint was already present, i.e. a repeated text.
		boolean firstSeen = insance.getMd5Texts().add(fingerprint(text));
		if (!firstSeen) {
			insance.setSameNum(insance.getSameNum() + 1L);
		}
	}

	/**
	 * Creates a new instance that already contains the fingerprint of the given text.
	 *
	 * @param xpath xpath to store on the instance
	 * @param text text value of the parsed DOM node; {@code null} is treated as empty text
	 * @return the newly created instance, with a "same" counter of 0
	 * @date 2013-10-21
	 */
	public static ItemTrainVo getInstance(String xpath, String text) {
		ItemTrainVo created = new ItemTrainVo();
		// A fresh instance has no fingerprints yet, so this can never count a repetition.
		updateInstance(created, xpath, text);
		return created;
	}

	/** Returns the lower-case hexadecimal MD5 of the UTF-8 bytes of the text. */
	private static String fingerprint(String text) {
		String safeText = (text == null) ? "" : text;
		return DigestUtils.md5Hex(safeText.getBytes(UTF_8));
	}

	public String getXpath() {
		return xpath;
	}

	public void setXpath(String xpath) {
		this.xpath = xpath;
	}

	public Set<String> getMd5Texts() {
		return md5Texts;
	}

	public void setMd5Texts(Set<String> md5Texts) {
		this.md5Texts = md5Texts;
	}

	public long getSameNum() {
		return sameNum;
	}

	public void setSameNum(long sameNum) {
		this.sameNum = sameNum;
	}

	@Override
	public String toString() {
		return xpath + "=>" + md5Texts;
	}
}

抽象训练器,如果以后有获取css path的可以继承此类:

import java.math.BigDecimal;
import java.math.RoundingMode;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

import com.panguso.techrd.parser.trainer.vo.ItemTrainVo;

/**
 * Base class for trainers. Subclasses decide how tag paths are extracted (for example xpath or
 * css path); this class provides the probability-based filtering of the collected statistics.
 */
public abstract class BaseTrainer {

	/** Number of decimal places the distinct-text ratio is rounded to before comparison. */
	private static final int RATIO_SCALE = 3;

	/**
	 * Filters the intermediate training data by a probability threshold.
	 *
	 * <p>For each entry the ratio {@code distinct / (distinct + sameNum)} is computed, where
	 * {@code distinct} is the number of stored fingerprints. The ratio is rounded to three
	 * decimals (half-down). Entries whose ratio is below {@code q} are removed from {@code map};
	 * the xpath of every remaining entry is collected into the returned set.
	 *
	 * @param map intermediate training data; entries below the threshold are removed in place
	 * @param q probability threshold
	 * @return the xpaths of the entries that were kept
	 * @throws Exception declared for subclasses/callers; nothing is thrown deliberately here
	 * @date 2013-10-24
	 */
	protected Set<String> cleanResult(Map<String, ItemTrainVo> map, double q) throws Exception {
		Set<String> xpaths = new HashSet<String>();
		Iterator<Entry<String, ItemTrainVo>> iterator = map.entrySet().iterator();
		while (iterator.hasNext()) {
			ItemTrainVo itv = iterator.next().getValue();
			long distinct = itv.getMd5Texts().size();
			long total = distinct + itv.getSameNum();
			if (total <= 0L) {
				// No observations at all: there is no ratio to compute (dividing would throw),
				// so the entry cannot satisfy the threshold and is dropped.
				iterator.remove();
				continue;
			}
			BigDecimal ratio = BigDecimal.valueOf(distinct)
					.divide(BigDecimal.valueOf(total), RATIO_SCALE, RoundingMode.HALF_DOWN);
			if (ratio.doubleValue() < q) {
				iterator.remove();
			} else {
				xpaths.add(itv.getXpath());
			}
		}
		return xpaths;
	}

	/**
	 * Walks the training set and fills {@code map} with one {@link ItemTrainVo} per tag path
	 * (css path or xpath, depending on the implementation). The map key is the tag path; the
	 * value holds the relative tag path, the fingerprints of all texts seen there and the number
	 * of repeated texts, which together are the basis of the probability computation.
	 *
	 * @param path root directory of the training set
	 * @param map empty map to be filled
	 * @throws Exception if the training set cannot be read or parsed
	 * @date 2013-10-24
	 */
	protected abstract void parseInfo(String path, Map<String, ItemTrainVo> map) throws Exception;

	/**
	 * Public entry point: produces the final training result. The map key is the tag path and
	 * the value is an example of the content found at that path.
	 *
	 * @param fileDirPath root path of the training set
	 * @return the training result
	 * @date 2013-10-24
	 */
	public abstract Map<String, String> train(String fileDirPath);

	/**
	 * Returns the tags that must be extracted in addition to text nodes
	 * (e.g. IMG - the example in the original comment was lost, TODO confirm).
	 *
	 * @return the tags to extract
	 * @date 2013-10-24
	 */
	protected abstract Set<String> getUsefullTags();

	/**
	 * Returns the tags to filter out (e.g. SCRIPT - the example in the original comment was
	 * lost, TODO confirm). When the trainer meets one of them it does not descend into it.
	 *
	 * @return the tags to filter out
	 * @date 2013-10-24
	 */
	protected abstract Set<String> getUnUsefullTags();
}

基于xpath训练器的实现:

import java.io.File;import java.io.FileInputStream;import java.math.BigDecimal;import java.util.Collections;import java.util.HashMap;import java.util.Iterator;import java.util.Map;import java.util.Set;import org.apache.commons.lang.StringUtils;import org.cyberneko.html.parsers.DOMParser;import org.dom4j.Attribute;import org.dom4j.Document;import org.dom4j.Element;import org.dom4j.io.DOMReader;import org.xml.sax.InputSource;import com.panguso.techrd.parser.trainer.vo.ItemTrainVo;/** * 基于概率的xpath训练器实现 *  * @date 2013-10-24 */public class XPathTrainer extends BaseTrainer {	private Set
usefullTags = Collections.emptySet(); private Set
unUsefullTags = Collections.emptySet(); public XPathTrainer(Set
usefullTags, Set
unUsefullTags) { super(); this.unUsefullTags = unUsefullTags; this.usefullTags = usefullTags; } @Override protected Set
getUnUsefullTags() { return unUsefullTags; } @Override protected Set
getUsefullTags() { return usefullTags; } @Override public Map
train(String fileDirPath) { Map
map = new HashMap
(); Map
result = new HashMap
(); try { parseInfo(fileDirPath, map); Set
xpaths = cleanResult(map, computQ(fileDirPath, true)); getResult(fileDirPath, result, xpaths); } catch (Exception e) { e.printStackTrace(); } return result; } private void getResult(String path, Map
result, Set
xpaths) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); Iterator
xpathIterator = xpaths.iterator(); while (xpathIterator.hasNext()) { String xpath = xpathIterator.next(); Element node = (Element) root.selectSingleNode(xpath); result.put(xpath, node.asXML()); } return; } } } private double computQ(String path, boolean defaultVal) { if (defaultVal) { return 1.0d; } File file = new File(path); int num = 0; File[] files = file.listFiles(); for (File file2 : files) { if (file2.isFile()) { num++; } } BigDecimal dif = new BigDecimal(num - 1); BigDecimal all = new BigDecimal(num); BigDecimal result = dif.divide(all, 3, BigDecimal.ROUND_HALF_DOWN); return result.doubleValue(); } @Override protected void parseInfo(String path, Map
map) throws Exception { File file = new File(path); if (!file.isDirectory()) { return; } File[] files = file.listFiles(); for (File file2 : files) { if (file2.isDirectory()) { continue; } DOMParser parser = new DOMParser(); parser.parse(new InputSource(new FileInputStream(file2))); DOMReader domReader = new DOMReader(); Document document = domReader.read(parser.getDocument()); Element root = document.getRootElement(); dom2PathMap(root, map); } } @SuppressWarnings("unchecked") private void dom2PathMap(Element root, Map
map) { if (this.getUnUsefullTags().contains(root.getName())) { return; } if (root == null || root.isTextOnly() || this.getUsefullTags().contains(root.getName())) { String text = root.getText(); String uniqXpath = root.getUniquePath(); String xpath = root.getPath(); if (StringUtils.isEmpty(text)) { Iterator
iterator = root.attributeIterator(); while (iterator.hasNext()) { Attribute attr = iterator.next(); text = attr.getName() + "." + attr.getValue() + text; } } if (map.containsKey(uniqXpath)) { ItemTrainVo.updateInstance(map.get(uniqXpath), xpath, text); } else { map.put(uniqXpath, ItemTrainVo.getInstance(xpath, text)); } return; } Iterator
iterator = root.elementIterator(); while (iterator.hasNext()) { Element el = iterator.next(); dom2PathMap(el, map); } }}

测试代码:

import java.util.HashSet;import java.util.Set;import org.junit.Test;import com.panguso.techrd.parser.trainer.service.XPathTrainer;public class XPathTrainerTest {	@Test	public final void testTrain() {		Set
usefullTags = new HashSet
(); Set
unUsefullTags = new HashSet
(); usefullTags.add("IMG"); unUsefullTags.add("HEAD"); unUsefullTags.add("SCRIPT"); String path = "/dom/163/shehui"; XPathTrainer xt = new XPathTrainer(usefullTags, unUsefullTags); System.out.println(xt.train(path)); }}

测试集如附件:

 

 

 

 

转载地址:http://tiqsi.baihongyu.com/

你可能感兴趣的文章
vue 项目中图片选择路径位置static 或 assets区别
查看>>
vue项目打包后无法运行报错空白页面
查看>>
Vue 解决部署到服务器后或者build之后Element UI图标不显示问题(404错误)
查看>>
element-ui全局自定义主题
查看>>
facebook库runtime.js
查看>>
vue2.* 中 使用socket.io
查看>>
openlayers安装引用
查看>>
js报错显示subString/subStr is not a function
查看>>
高德地图js API实现鼠标悬浮于点标记时弹出信息窗体显示详情,点击点标记放大地图操作
查看>>
初始化VUE项目报错
查看>>
vue项目使用安装sass
查看>>
HTTP和HttpServletRequest 要点
查看>>
在osg场景中使用GLSL语言——一个例子
查看>>
关于无线PCB中 中50欧姆的特性阻抗的注意事项
查看>>
Spring的单例模式源码小窥
查看>>
后台服务的变慢排查思路(轻量级应用服务器中测试)
查看>>
MySQL中InnoDB事务的默认隔离级别测试
查看>>
微服务的注册与发现
查看>>
bash: service: command not found
查看>>
linux Crontab 使用 --定时任务
查看>>