English

使用海星+XSLT模板,抽取阿里巴巴搜索结果相当简单

  1. 使用 Firebug 分析搜索结果的出现规律,并用 XPath 表示出来://div[@class='offer']

  2. 使用XPather, 检查所得到的XPATH是否正确

  3. 使用XPather, 分析公司名称及链接

  4. 使用XPather, 分析验证标记

  5. 使用XPather, 获取公司描述

  6. 使用XPather, 获取主营业务

示例代码

TestSeastarAlibaba.java : 使用海星+XSLT模板+Saxon+XStream抽取阿里巴巴搜索结果 下载

package test.xslt;

import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.DocumentSource;

import com.thoughtworks.xstream.XStream;
import com.zhsoft88.commons.Seastar;
import com.zhsoft88.commons.Seastar.SeastarResult;

/**
 * Test of Seastar: alibaba search results analysis.
 *
 * <p>Pipeline run by {@link #main(String[])}:
 * <ol>
 *   <li>download the search page with Commons HttpClient;</li>
 *   <li>turn the HTML into well-formed XML with Seastar and parse it with dom4j;</li>
 *   <li>apply the {@code alibaba.xsl} stylesheet (Saxon) to extract the result rows;</li>
 *   <li>map every {@code <search-result>} element onto a {@link SearchResult} with XStream.</li>
 * </ol>
 *
 * @author zhsoft88
 * @since 2008-08-25
 */
public class TestSeastarAlibaba {

	/** Status code of a successful HTTP response. */
	private static final int HTTP_OK = 200;

	/** Charset used to decode the downloaded page. */
	private static final String RESPONSE_CHARSET = "gbk";

	/** Stylesheet resource, resolved relative to this class. */
	private static final String STYLESHEET = "alibaba.xsl";

	/**
	 * One row of the search result list. XStream fills the fields from the
	 * child elements of a {@code <search-result>} element produced by the
	 * stylesheet (element name == field name).
	 */
	public static class SearchResult {
		
		/** Company name (text of the company link). */
		private String company;
		/** Target of the company link. */
		private String url;
		/** Verification mark. */
		private String verify;
		/** Company description. */
		private String description;
		/** Non-blank text found directly inside the info block. */
		private String type;
		/** Main business of the company. */
		private String business;
		
		/** Creates an empty result; all fields are {@code null}. */
		public SearchResult() {
			
		}

		public String getCompany() {
			return company;
		}

		public void setCompany(String company) {
			this.company = company;
		}

		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getVerify() {
			return verify;
		}

		public void setVerify(String verify) {
			this.verify = verify;
		}

		public String getDescription() {
			return description;
		}

		public void setDescription(String description) {
			this.description = description;
		}

		public String getType() {
			return type;
		}

		public void setType(String type) {
			this.type = type;
		}

		public String getBusiness() {
			return business;
		}

		public void setBusiness(String business) {
			this.business = business;
		}

		/** Returns all six fields in a single-line, human readable form. */
		@Override
		public String toString() {
			return "SearchResult[company="+company+",url="+url+",verify="+verify+",description="+description+",type="+type+",business="+business+"]";
		}
	}

	/**
	 * Downloads one search result page, extracts the result rows and prints
	 * each of them to standard output.
	 *
	 * @param args ignored
	 * @throws Exception if the download, the HTML structuring, the XSLT
	 *         transformation or the XML-to-object mapping fails
	 */
	public static void main(String[] args) throws Exception {
		// Force Saxon: the stylesheet is XSLT 2.0.
		System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
		String url = "http://search.china.alibaba.com/search/company_search.htm?tracelog=po_searchcompany_select_bf&tracelog=&keywords=%BC%D2%BE%D3%D3%C3%C6%B7&submit=+%D6%D8%D0%C2%CB%D1%CB%F7+";
		// 1. use httpclient, get html source
		String origContent = fetchHtml(url);
		// 2. use seastar, transform html to dom (xml)
		Document doc = structure(origContent);
		// 3. use saxon/xslt, extract all search result
		String resultsXml = extractResults(doc, url);
		// 4. use xstream, transform xml to objects
		for (SearchResult item : parseResults(resultsXml)) {
			System.out.println(item);
		}
	}

	/**
	 * Fetches {@code url} with browser-like headers and returns the decoded body.
	 *
	 * @param url address of the page to download
	 * @return response body decoded with {@link #RESPONSE_CHARSET}
	 * @throws IOException on transport errors, on a non-200 status or when
	 *         the response carries no body
	 */
	private static String fetchHtml(String url) throws IOException {
		HttpClient client = new HttpClient();
		client.getParams().setSoTimeout(3*60*1000);
		client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
		List<Header> defaultHeaders = new ArrayList<Header>();
		defaultHeaders.add(new Header("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1) Gecko/20061010"));
		defaultHeaders.add(new Header("Connection","keep-alive"));
		defaultHeaders.add(new Header("Keep-Alive","300"));
		client.getParams().setParameter("http.default-headers", defaultHeaders);
		GetMethod get = new GetMethod(url);
		try {
			int status = client.executeMethod(get);
			if (status != HTTP_OK) {
				// Do not feed an error page into the extraction pipeline.
				throw new IOException("Unexpected HTTP status " + status + " for " + url);
			}
			InputStream body = get.getResponseBodyAsStream();
			if (body == null) {
				throw new IOException("Empty response body for " + url);
			}
			return IOUtils.toString(body, RESPONSE_CHARSET);
		} finally {
			// Always hand the connection back to the connection manager.
			get.releaseConnection();
		}
	}

	/**
	 * Converts raw HTML into a dom4j document by way of Seastar.
	 *
	 * @param html page source as downloaded
	 * @return the parsed document built from Seastar's output
	 * @throws Exception if Seastar fails or its output cannot be parsed
	 */
	private static Document structure(String html) throws Exception {
		Seastar ss = new Seastar();
		SeastarResult result = ss.structString(html);
		return DocumentHelper.parseText(result.getContents());
	}

	/**
	 * Applies the {@link #STYLESHEET} to {@code doc}.
	 *
	 * @param doc structured page
	 * @param systemId system id assigned to the transformation source
	 * @return serialized output of the transformation
	 * @throws Exception if the stylesheet is missing, cannot be compiled, or
	 *         the transformation fails
	 */
	private static String extractResults(Document doc, String systemId) throws Exception {
		InputStream xsl = TestSeastarAlibaba.class.getResourceAsStream(STYLESHEET);
		if (xsl == null) {
			throw new IllegalStateException("Stylesheet not found on classpath: " + STYLESHEET);
		}
		try {
			TransformerFactory factory = TransformerFactory.newInstance();
			Transformer transformer = factory.newTransformer( new StreamSource(xsl) );
			DocumentSource source = new DocumentSource( doc );
			source.setSystemId(systemId);
			StringWriter writer = new StringWriter();
			transformer.transform( source, new StreamResult(writer) );
			return writer.toString();
		} finally {
			xsl.close();
		}
	}

	/**
	 * Maps every {@code <search-result>} element of {@code resultsXml} onto a
	 * {@link SearchResult}.
	 *
	 * @param resultsXml output of the XSLT transformation
	 * @return the results in document order; empty when none were found
	 * @throws Exception if the XML cannot be parsed or mapped
	 */
	private static List<SearchResult> parseResults(String resultsXml) throws Exception {
		// NOTE(review): recent XStream releases refuse types that are not
		// explicitly allowed; SearchResult may have to be whitelisted when
		// upgrading the library.
		XStream xs = new XStream();
		xs.alias("search-result", SearchResult.class);
		Document resultDom = DocumentHelper.parseText(resultsXml);
		// List<?> avoids the unchecked raw-List assignment and compiles
		// regardless of the element type dom4j declares for selectNodes().
		List<?> nodes = resultDom.selectNodes("//search-result");
		List<SearchResult> items = new ArrayList<SearchResult>(nodes.size());
		for (Object node : nodes) {
			items.add((SearchResult)xs.fromXML(((Element)node).asXML()));
		}
		return items;
	}

}

alibaba.xsl : XSLT模板 下载

<?xml version="1.0" encoding="utf-8"?>
<!--
    Extracts the company search results from the structured result page.

    Input : the XML document built from the downloaded HTML page.
    Output: a <results> element holding one <search-result> per result row,
            with the children company, url, verify, description, type and
            business (all plain text).
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:saxon="http://saxon.sf.net/"
    xmlns:my="myFunctions"
    extension-element-prefixes="saxon my"
    >
    <!-- Returns its argument as text. Not called anywhere in this stylesheet. -->
    <xsl:function name="my:merge">
        <xsl:param name="a"/>
        <xsl:value-of select="$a"/>
    </xsl:function>

    <xsl:template match="/">
    <results>
    <!--
        One result row per div whose class is "offer". normalize-space()
        tolerates stray whitespace around the attribute value, so both
        class="offer" and class=" offer" are matched; the previous literal
        comparison only accepted the spelling with a leading space.
    -->
    <xsl:for-each select="//div[normalize-space(@class)='offer']">
        <!-- Every field below is read from the info block(s) of the row. -->
        <xsl:variable name="info" select=".//div[@class='info']"/>
        <search-result>
            <!-- company name and link -->
            <company><xsl:value-of select="$info/span/a"/></company>
            <url><xsl:value-of select="$info/span/a/@href"/></url>
            <!-- verification mark -->
            <verify><xsl:value-of select="$info/span[2]"/></verify>
            <!-- company description -->
            <description><xsl:value-of select="$info/span[3]"/></description>
            <!-- non-blank text nodes directly inside the info block -->
            <type><xsl:value-of select="$info/text()[string-length(normalize-space(.))>0]"/></type>
            <!-- main business -->
            <business><xsl:value-of select="$info/span[4]"/></business>
        </search-result>
    </xsl:for-each>
    </results>
    </xsl:template>
</xsl:stylesheet>

产品族: 海狮 海猫 海葵 海蛛 海鹞 海星 海狗 WBXL Xultray webapp
iDocSet iDocSetHelper 雨燕 templateJS skiafy tranid 犀利播放器 犀利助手 网址导航 原创歌曲
(C) 2024 抓糖网 版权所有

update: 2013-06-07