TestSeastarAlibaba.java : 使用海星+XSLT模板+Saxon+XStream抽取阿里巴巴搜索结果 下载
package test.xslt;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;
import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.DocumentSource;
import com.thoughtworks.xstream.XStream;
import com.zhsoft88.commons.Seastar;
import com.zhsoft88.commons.Seastar.SeastarResult;
/**
 * Test of Seastar: Alibaba search results analysis.
 *
 * <p>Pipeline executed by {@link #main(String[])}:
 * <ol>
 *   <li>download the search page with Commons HttpClient and decode it as GBK;</li>
 *   <li>convert the HTML into XML with Seastar and parse it into a dom4j document;</li>
 *   <li>run the {@code alibaba.xsl} stylesheet (Saxon is selected as the JAXP
 *       transformer) to extract the hits as {@code <search-result>} elements;</li>
 *   <li>unmarshal every {@code <search-result>} into a {@link SearchResult} with
 *       XStream and print it.</li>
 * </ol>
 *
 * @author zhsoft88
 * @since 2008-08-25
 */
public class TestSeastarAlibaba {

    /** Company search URL; the query parameters are GBK percent-encoded. */
    private static final String SEARCH_URL = "http://search.china.alibaba.com/search/company_search.htm?tracelog=po_searchcompany_select_bf&tracelog=&keywords=%BC%D2%BE%D3%D3%C3%C6%B7&submit=+%D6%D8%D0%C2%CB%D1%CB%F7+";

    /** Socket read timeout handed to HttpClient: three minutes, in milliseconds. */
    private static final int SO_TIMEOUT_MILLIS = 3 * 60 * 1000;

    /** Charset used to decode the response body. */
    private static final String PAGE_CHARSET = "gbk";

    /** HTTP status code of a successful response. */
    private static final int HTTP_OK = 200;

    /** Stylesheet resource, resolved relative to this class's package. */
    private static final String STYLESHEET = "alibaba.xsl";

    /**
     * One search hit. The field names equal the child element names that
     * {@code alibaba.xsl} emits inside {@code <search-result>}; XStream is
     * expected to fill the fields from those elements.
     */
    public static class SearchResult {
        private String company;
        private String url;
        private String verify;
        private String description;
        private String type;
        private String business;

        /** Creates an empty result; all fields start out {@code null}. */
        public SearchResult() {
        }

        public String getCompany() {
            return company;
        }

        public void setCompany(String company) {
            this.company = company;
        }

        public String getUrl() {
            return url;
        }

        public void setUrl(String url) {
            this.url = url;
        }

        public String getVerify() {
            return verify;
        }

        public void setVerify(String verify) {
            this.verify = verify;
        }

        public String getDescription() {
            return description;
        }

        public void setDescription(String description) {
            this.description = description;
        }

        public String getType() {
            return type;
        }

        public void setType(String type) {
            this.type = type;
        }

        public String getBusiness() {
            return business;
        }

        public void setBusiness(String business) {
            this.business = business;
        }

        /** Renders all six fields as {@code SearchResult[company=...,url=...,...]}. */
        @Override
        public String toString() {
            return "SearchResult[company="+company+",url="+url+",verify="+verify+",description="+description+",type="+type+",business="+business+"]";
        }
    }

    /**
     * Runs the download / structure / transform / unmarshal pipeline once and
     * prints every extracted {@link SearchResult} to standard output.
     *
     * @param args ignored
     * @throws Exception if the download, the XML parsing, the XSLT transformation
     *         or the unmarshalling fails
     */
    public static void main(String[] args) throws Exception {
        // Must be set before the first TransformerFactory.newInstance() call so that
        // JAXP picks Saxon (the stylesheet is declared as XSLT 2.0).
        System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");

        // 1. use httpclient, get html source
        String origContent = fetchPage(SEARCH_URL);
        // 2. use seastar, transform html to dom (xml)
        Document doc = htmlToDom(origContent);
        // 3. use saxon/xslt, extract all search result
        String resultsXml = extractResultsXml(doc, SEARCH_URL);
        // 4. use xstream, transform xml to objects
        printResults(resultsXml);
    }

    /**
     * Downloads {@code url} with a browser-like HttpClient configuration and
     * returns the body decoded with {@link #PAGE_CHARSET}.
     *
     * @param url the page to fetch
     * @return the decoded response body
     * @throws IOException on transport errors, on a non-200 status, or when the
     *         response carries no body
     */
    private static String fetchPage(String url) throws IOException {
        HttpClient client = new HttpClient();
        client.getParams().setSoTimeout(SO_TIMEOUT_MILLIS);
        client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);

        List<Header> defaultHeaders = new ArrayList<Header>();
        defaultHeaders.add(new Header("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1) Gecko/20061010"));
        defaultHeaders.add(new Header("Connection","keep-alive"));
        defaultHeaders.add(new Header("Keep-Alive","300"));
        client.getParams().setParameter("http.default-headers", defaultHeaders);

        GetMethod get = new GetMethod(url);
        try {
            int status = client.executeMethod(get);
            if (status != HTTP_OK) {
                // Do not feed an error page into the extraction pipeline.
                throw new IOException("Unexpected HTTP status " + status + " (" + get.getStatusText() + ") for " + url);
            }
            InputStream body = get.getResponseBodyAsStream();
            if (body == null) {
                throw new IOException("Response for " + url + " has no body");
            }
            return IOUtils.toString(body, PAGE_CHARSET);
        } finally {
            // Always hand the connection back to the connection manager.
            get.releaseConnection();
        }
    }

    /**
     * Lets Seastar structure the HTML text and parses its output into a dom4j document.
     *
     * <p>NOTE(review): the parsed text is derived from remote content; whether the
     * dom4j version in use resolves external entities by default should be confirmed.
     *
     * @param html raw HTML of the page
     * @return the dom4j document built from Seastar's output
     * @throws Exception if Seastar fails or its output is not well-formed XML
     */
    private static Document htmlToDom(String html) throws Exception {
        Seastar seastar = new Seastar();
        SeastarResult structured = seastar.structString(html);
        return DocumentHelper.parseText(structured.getContents());
    }

    /**
     * Applies the {@link #STYLESHEET} to the page document.
     *
     * @param pageDom  the structured page
     * @param systemId system id assigned to the transformation source (the page URL)
     * @return the serialized transformation output
     * @throws Exception if the stylesheet is missing, cannot be compiled, or the
     *         transformation fails
     */
    private static String extractResultsXml(Document pageDom, String systemId) throws Exception {
        InputStream xsl = TestSeastarAlibaba.class.getResourceAsStream(STYLESHEET);
        if (xsl == null) {
            throw new IOException("Stylesheet resource '" + STYLESHEET + "' not found for " + TestSeastarAlibaba.class.getName());
        }
        try {
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer transformer = factory.newTransformer(new StreamSource(xsl));

            DocumentSource source = new DocumentSource(pageDom);
            source.setSystemId(systemId);

            StringWriter writer = new StringWriter();
            transformer.transform(source, new StreamResult(writer));
            return writer.toString();
        } finally {
            xsl.close();
        }
    }

    /**
     * Unmarshals every {@code <search-result>} element of {@code resultsXml} into a
     * {@link SearchResult} and prints it, one per line, in document order.
     *
     * <p>NOTE(review): recent XStream releases restrict which types may be
     * unmarshalled by default; if this is run against such a version,
     * {@code SearchResult} presumably has to be allowed explicitly. Confirm against
     * the XStream version on the classpath.
     *
     * @param resultsXml output of the XSLT step
     * @throws Exception if {@code resultsXml} is not well-formed or unmarshalling fails
     */
    private static void printResults(String resultsXml) throws Exception {
        XStream xs = new XStream();
        xs.alias("search-result", SearchResult.class);

        Document resultDom = DocumentHelper.parseText(resultsXml);
        // List<?> compiles against both the raw List of dom4j 1.x and the
        // List<Node> of dom4j 2.x; the XPath only ever selects elements.
        List<?> nodes = resultDom.selectNodes("//search-result");
        for (Object node : nodes) {
            Element e = (Element) node;
            SearchResult item = (SearchResult) xs.fromXML(e.asXML());
            System.out.println(item);
        }
    }
}
alibaba.xsl : XSLT模板 下载
<?xml version="1.0" encoding="utf-8"?>
<!--
  Turns the structured search page into a flat list of hits:

    <results>
      <search-result> company, url, verify, description, type, business </search-result>
      ...
    </results>

  One <search-result> is produced per div whose class attribute is exactly
  ' offer' (the leading space is part of the matched value).
-->
<xsl:stylesheet version="2.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:saxon="http://saxon.sf.net/"
    xmlns:my="myFunctions"
    extension-element-prefixes="saxon my">

  <!-- Returns its argument rendered as text. Not called from within this stylesheet. -->
  <xsl:function name="my:merge">
    <xsl:param name="a"/>
    <xsl:value-of select="$a"/>
  </xsl:function>

  <!-- Entry point: wrap all offers, in document order, in a single <results> element. -->
  <xsl:template match="/">
    <results>
      <xsl:apply-templates select="//div[@class=' offer']" mode="offer"/>
    </results>
  </xsl:template>

  <!-- One offer: every field is read from the nested div(s) with class 'info'. -->
  <xsl:template match="div" mode="offer">
    <xsl:variable name="info" select=".//div[@class='info']"/>
    <search-result>
      <!-- company name and link come from the anchor(s) inside the info spans -->
      <company><xsl:value-of select="$info/span/a"/></company>
      <url><xsl:value-of select="$info/span/a/@href"/></url>
      <!-- the remaining span-based fields are positional: 2nd, 3rd and 4th span -->
      <verify><xsl:value-of select="$info/span[2]"/></verify>
      <description><xsl:value-of select="$info/span[3]"/></description>
      <!-- type is the non-blank text that sits directly inside the info div -->
      <type><xsl:value-of select="$info/text()[string-length(normalize-space(.))>0]"/></type>
      <business><xsl:value-of select="$info/span[4]"/></business>
    </search-result>
  </xsl:template>

</xsl:stylesheet>