TestSeastarGoogle.java : 使用海星+XSLT模板+Saxon+XStream抽取谷歌搜索结果 下载
package test.xslt;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.List;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.DocumentSource;
import com.thoughtworks.xstream.XStream;
import com.zhsoft88.commons.Seastar;
import com.zhsoft88.commons.Seastar.SeastarResult;
/**
 * Test of Seastar: Google search results analysis.
 *
 * <p>The program runs a four-step pipeline:
 * <ol>
 *   <li>download a Google result page with Commons HttpClient,</li>
 *   <li>turn the HTML into well-formed XML with Seastar and parse it with dom4j,</li>
 *   <li>apply the {@code google.xsl} stylesheet through Saxon,</li>
 *   <li>map every {@code search-result} element of the output to a
 *       {@link SearchResult} with XStream and print it.</li>
 * </ol>
 *
 * @author zhsoft88
 * @since 2008-08-25
 */
public class TestSeastarGoogle {

    /** Search URL fetched by {@link #main(String[])}. */
    private static final String SEARCH_URL =
            "http://www.google.cn/search?complete=1&hl=zh-CN&newwindow=1&q=%E6%B5%B7%E6%98%9F%E5%9E%82%E7%9B%B4%E6%90%9C%E7%B4%A2&btnG=Google+%E6%90%9C%E7%B4%A2&meta=&aq=f";

    /** Charset used to decode the response body. */
    private static final String PAGE_CHARSET = "gbk";

    /** Stylesheet resource, resolved relative to this class. */
    private static final String STYLESHEET = "google.xsl";

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /**
     * One search hit. XStream fills the fields from the child elements
     * {@code title}, {@code url} and {@code snippets} of a
     * {@code search-result} element.
     */
    public static class SearchResult {
        private String title;
        private String url;
        private String snippets;

        public SearchResult() {
        }

        /** @return the title of the hit */
        public String getTitle() {
            return title;
        }

        /** @param title the title of the hit */
        public void setTitle(String title) {
            this.title = title;
        }

        /** @return the link target of the hit */
        public String getUrl() {
            return url;
        }

        /** @param url the link target of the hit */
        public void setUrl(String url) {
            this.url = url;
        }

        /** @return the snippet text of the hit */
        public String getSnippets() {
            return snippets;
        }

        /** @param snippets the snippet text of the hit */
        public void setSnippets(String snippets) {
            this.snippets = snippets;
        }

        @Override
        public String toString() {
            return "SearchResult[title="+title+",url="+url+",snippets="+snippets+"]";
        }
    }

    /**
     * Runs the whole pipeline once and prints each extracted result to
     * standard output.
     *
     * @param args ignored
     * @throws Exception if the download, the transformation or the XML
     *         parsing fails
     */
    public static void main(String[] args) throws Exception {
        // Make JAXP hand out Saxon as the TransformerFactory implementation.
        System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");

        // 1. use httpclient, get html source
        String origContent = fetchPage(SEARCH_URL, PAGE_CHARSET);

        // 2. use seastar, transform html to dom (xml)
        Seastar ss = new Seastar();
        SeastarResult result = ss.structString(origContent);
        Document doc = DocumentHelper.parseText(result.getContents());

        // 3. use saxon/xslt, extract all search result
        String resultXml = applyStylesheet(doc);

        // 4. use xstream, transform xml to objects
        // NOTE(review): recent XStream releases deny all types unless they are
        // explicitly allowed (e.g. via allowTypes); that call does not exist in
        // old releases, so it is not added here - confirm the XStream version.
        XStream xs = new XStream();
        xs.alias("search-result", SearchResult.class);
        Document resultDom = DocumentHelper.parseText(resultXml);
        // selectNodes returns a raw List in dom4j 1.x and List<Node> in 2.x;
        // a wildcard list plus a cast per node compiles cleanly against both.
        List<?> nodes = resultDom.selectNodes("//search-result");
        for (Object node : nodes) {
            Element e = (Element) node;
            SearchResult item = (SearchResult) xs.fromXML(e.asXML());
            System.out.println(item);
        }
    }

    /**
     * Downloads {@code url} with a GET request and decodes the body.
     *
     * @param url     address to fetch
     * @param charset charset name used to decode the response body
     * @return the decoded response body
     * @throws IOException if the request fails or the server does not
     *         answer with status 200
     */
    private static String fetchPage(String url, String charset) throws IOException {
        HttpClient client = new HttpClient();
        GetMethod get = new GetMethod(url);
        try {
            int status = client.executeMethod(get);
            if (status != HTTP_OK) {
                // Fail loudly instead of feeding an error page to the parser.
                throw new IOException("Unexpected HTTP status " + status + " for " + url);
            }
            return IOUtils.toString(get.getResponseBodyAsStream(), charset);
        } finally {
            // The connection must always be handed back, even on failure.
            get.releaseConnection();
        }
    }

    /**
     * Applies the {@code google.xsl} stylesheet to {@code doc}.
     *
     * @param doc XML document to transform
     * @return the serialized transformation result
     * @throws IOException if the stylesheet resource is missing or cannot
     *         be closed
     * @throws TransformerException if the stylesheet cannot be compiled or
     *         the transformation fails
     */
    private static String applyStylesheet(Document doc) throws IOException, TransformerException {
        InputStream xsl = TestSeastarGoogle.class.getResourceAsStream(STYLESHEET);
        if (xsl == null) {
            throw new IOException("Stylesheet resource not found next to "
                    + TestSeastarGoogle.class.getName() + ": " + STYLESHEET);
        }
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer(new StreamSource(xsl));
            DocumentSource source = new DocumentSource(doc);
            source.setSystemId("http://www.google.cn/");
            StringWriter writer = new StringWriter();
            transformer.transform(source, new StreamResult(writer));
            return writer.toString();
        } finally {
            xsl.close();
        }
    }
}
google.xsl : XSLT模板 下载
<?xml version="1.0" encoding="utf-8"?>
<!--
  google.xsl: builds a flat <results> document from the XML form of a
  search result page. Each li element with class "g" becomes one
  <search-result> holding <title>, <url> and <snippets>.

  The "saxon" and "my" namespaces are listed in exclude-result-prefixes so
  that their declarations are not copied onto the output elements. "my" is
  a function namespace, not an extension-instruction namespace, so it does
  not belong in extension-element-prefixes.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:saxon="http://saxon.sf.net/"
    xmlns:my="myFunctions"
    exclude-result-prefixes="saxon my"
    >

  <!--
    my:merge($a): returns one text node containing the string values of all
    items in $a, joined by the default xsl:value-of separator (a single
    space in XSLT 2.0).
  -->
  <xsl:function name="my:merge">
    <xsl:param name="a"/>
    <xsl:value-of select="$a"/>
  </xsl:function>

  <!-- Entry point: emit one <search-result> per li[@class='g'] in the input. -->
  <xsl:template match="/">
    <results>
      <xsl:for-each select="//li[@class='g']">
        <search-result>
          <!-- Text of the heading link. -->
          <title><xsl:value-of select="h3/a"/></title>
          <!-- Target of the heading link, copied as written in the page. -->
          <url><xsl:value-of select="h3/a/@href"/></url>
          <!-- Child nodes of div class "s", leaving out cite, span and br elements. -->
          <snippets><xsl:value-of select="my:merge(div[@class='s']/node()[not(self::cite or self::span or self::br)])"/></snippets>
        </search-result>
      </xsl:for-each>
    </results>
  </xsl:template>
</xsl:stylesheet>