English

使用海星+XSLT模板,抽取谷歌搜索结果相当简单

  1. 使用 Firebug 分析搜索结果的出现规律,并用 XPath 表示出来://li[@class='g']

  2. 使用XPather, 检查所得到的XPATH是否正确

  3. 使用XPather, 分析标题及链接

  4. 使用XPather, 分析摘要片段(snippets)

  5. 使用XPather, 获取正确的摘要片段

示例代码

TestSeastarGoogle.java : 使用海星+XSLT模板+Saxon+XStream抽取谷歌搜索结果 下载

package test.xslt;

import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.List;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.DocumentSource;

import com.thoughtworks.xstream.XStream;
import com.zhsoft88.commons.Seastar;
import com.zhsoft88.commons.Seastar.SeastarResult;

/**
 * Test of Seastar: google search results analysis 
 * @author zhsoft88
 * @since 2008-08-25
 */
public class TestSeastarGoogle {

	/** Search request whose result page is fetched and analysed. */
	private static final String SEARCH_URL = "http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&q=%E6%B5%B7%E6%98%9F%E5%9E%82%E7%9B%B4%E6%90%9C%E7%B4%A2&btnG=Google+%E6%90%9C%E7%B4%A2&meta=&aq=f";

	/** Charset used to decode the response body. */
	private static final String RESPONSE_ENCODING = "gbk";

	/** Name of the XSLT template, resolved relative to this class. */
	private static final String XSL_RESOURCE = "google.xsl";

	/** System id assigned to the transformation source. */
	private static final String SOURCE_SYSTEM_ID = "http://www.google.cn/";

	/** The only HTTP status accepted when fetching the page. */
	private static final int HTTP_OK = 200;

	/**
	 * One extracted search hit. XStream populates it from a
	 * <code>search-result</code> element whose children are
	 * <code>title</code>, <code>url</code> and <code>snippets</code>.
	 */
	public static class SearchResult {
		
		private String title;
		private String url;
		private String snippets;
		
		public SearchResult() {
			
		}

		public String getTitle() {
			return title;
		}

		public void setTitle(String title) {
			this.title = title;
		}

		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getSnippets() {
			return snippets;
		}

		public void setSnippets(String snippets) {
			this.snippets = snippets;
		}
		
		@Override
		public String toString() {
			return "SearchResult[title="+title+",url="+url+",snippets="+snippets+"]";
		}
	}

	/**
	 * Fetches the search page, converts it to XML with Seastar, applies the
	 * XSLT template and prints every extracted {@link SearchResult}.
	 *
	 * @param args ignored
	 * @throws Exception if fetching, parsing or transforming fails
	 */
	public static void main(String[] args) throws Exception {
		// Must be set before the first TransformerFactory.newInstance() call.
		System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
		// 1. use httpclient, get html source
		String origContent = fetchHtml(SEARCH_URL);
		// 2. use seastar, transform html to dom (xml)
		Seastar ss = new Seastar();
		SeastarResult result = ss.structString(origContent);
		Document doc = DocumentHelper.parseText(result.getContents());
		// 3. use saxon/xslt, extract all search result
		String resultXml = applyTemplate(doc);
		// 4. use xstream, transform xml to objects
		// NOTE(review): recent XStream releases are reported to require an explicit
		// type allow-list before fromXML() - confirm against the version in use.
		XStream xs = new XStream();
		xs.alias("search-result", SearchResult.class);
		Document resultDom = DocumentHelper.parseText(resultXml);
		// Wildcard list + cast avoids the unchecked raw-List conversion.
		List<?> nodes = resultDom.selectNodes("//search-result");
		for (Object node : nodes) {
			Element e = (Element) node;
			SearchResult item = (SearchResult)xs.fromXML(e.asXML());
			System.out.println(item);
		}
	}

	/**
	 * Downloads the given URL and returns its body decoded with
	 * {@link #RESPONSE_ENCODING}. The connection is always released.
	 *
	 * @param url address to fetch with HTTP GET
	 * @return the decoded response body
	 * @throws IOException on transport errors, a non-200 status or a missing body
	 */
	private static String fetchHtml(String url) throws IOException {
		HttpClient client = new HttpClient();
		GetMethod get = new GetMethod(url);
		try {
			int status = client.executeMethod(get);
			if (status != HTTP_OK) {
				throw new IOException("Unexpected HTTP status " + status + " for " + url);
			}
			InputStream body = get.getResponseBodyAsStream();
			if (body == null) {
				throw new IOException("Empty response body for " + url);
			}
			try {
				return IOUtils.toString(body, RESPONSE_ENCODING);
			} finally {
				body.close();
			}
		} finally {
			// Without this the underlying connection is never handed back.
			get.releaseConnection();
		}
	}

	/**
	 * Runs the {@link #XSL_RESOURCE} template over the document and returns
	 * the serialized transformation output.
	 *
	 * @param doc XML form of the fetched page
	 * @return the transformation result as a string
	 * @throws IOException if the template resource cannot be found or closed
	 * @throws TransformerException if compiling or applying the template fails
	 */
	private static String applyTemplate(Document doc) throws IOException, TransformerException {
		InputStream xsl = TestSeastarGoogle.class.getResourceAsStream(XSL_RESOURCE);
		if (xsl == null) {
			throw new IOException("XSLT template not found on classpath: " + XSL_RESOURCE);
		}
		try {
			TransformerFactory factory = TransformerFactory.newInstance();
			Transformer transformer = factory.newTransformer( new StreamSource(xsl) );
			DocumentSource source = new DocumentSource( doc );
			source.setSystemId(SOURCE_SYSTEM_ID);
			StringWriter writer = new StringWriter();
			transformer.transform( source, new StreamResult(writer) );
			return writer.toString();
		} finally {
			xsl.close();
		}
	}

}

google.xsl : XSLT模板 下载

<?xml version="1.0" encoding="utf-8"?>
<!--
    Turns the XML form of a search result page into a flat list:
    one <search-result> (title, url, snippets) per li[@class='g'],
    all wrapped in a single <results> element.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:saxon="http://saxon.sf.net/"
    xmlns:my="myFunctions"
    exclude-result-prefixes="saxon my"
    >
    <!--
        "my" only holds a stylesheet function and "saxon" is not used for any
        instruction, so both are listed in exclude-result-prefixes (keeps the
        declarations out of the output) instead of being declared as
        extension-element namespaces.
    -->

    <!-- Collapses the items passed in $a into one value via xsl:value-of. -->
    <xsl:function name="my:merge">
        <xsl:param name="a"/>
        <xsl:value-of select="$a"/>
    </xsl:function>

    <xsl:template match="/">
        <results>
            <xsl:for-each select="//li[@class='g']">
                <search-result>
                    <!-- Link text and target of the hit's heading. -->
                    <title><xsl:value-of select="h3/a"/></title>
                    <url><xsl:value-of select="h3/a/@href"/></url>
                    <!-- Content of div.s, leaving out cite, span and br children. -->
                    <snippets><xsl:value-of select="my:merge(div[@class='s']/node()[not(self::cite or self::span or self::br)])"/></snippets>
                </search-result>
            </xsl:for-each>
        </results>
    </xsl:template>
</xsl:stylesheet>

产品族: 海狮 海猫 海葵 海蛛 海鹞 海星 海狗 WBXL Xultray webapp
iDocSet iDocSetHelper 雨燕 templateJS skiafy tranid 犀利播放器 犀利助手 网址导航 原创歌曲
(C) 2024 抓糖网 版权所有

update: 2013-06-07