中文

Extracting Alibaba search results made easy with Seastar and an XSLT template

  1. Using Firebug, find the XPath rule that matches each search result: //div[@class='offer']

  2. Using XPather, check that the XPath is valid

  3. Using XPather, analyze the company name and URL

  4. Using XPather, get the verification flag

  5. Using XPather, get company description

  6. Using XPather, get main business

Example Code

TestSeastarAlibaba.java : extracts the search results using Seastar, an XSLT template, the Saxon processor and XStream Download

package test.xslt;

import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.List;

import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.stream.StreamResult;
import javax.xml.transform.stream.StreamSource;

import org.apache.commons.httpclient.Header;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.io.IOUtils;
import org.dom4j.Document;
import org.dom4j.DocumentHelper;
import org.dom4j.Element;
import org.dom4j.io.DocumentSource;

import com.thoughtworks.xstream.XStream;
import com.zhsoft88.commons.Seastar;
import com.zhsoft88.commons.Seastar.SeastarResult;

/**
 * Test of Seastar: alibaba search results analysis.
 * <p>
 * The program downloads one alibaba company-search page, converts the HTML
 * to XML with Seastar, applies the <code>alibaba.xsl</code> stylesheet and
 * finally maps every <code>search-result</code> element of the transformed
 * document onto a {@link SearchResult} with XStream, printing each one.
 *
 * @author zhsoft88
 * @since 2008-08-25
 */
public class TestSeastarAlibaba {

	/**
	 * One entry of the search result page.
	 * <p>
	 * Instances are populated by XStream from a <code>search-result</code>
	 * element, so the field names must stay equal to the child element names
	 * written by the stylesheet.
	 */
	public static class SearchResult {
		
		private String company;
		private String url;
		private String verify;
		private String description;
		private String type;
		private String business;
		
		public SearchResult() {
			
		}

		public String getCompany() {
			return company;
		}

		public void setCompany(String company) {
			this.company = company;
		}

		public String getUrl() {
			return url;
		}

		public void setUrl(String url) {
			this.url = url;
		}

		public String getVerify() {
			return verify;
		}

		public void setVerify(String verify) {
			this.verify = verify;
		}

		public String getDescription() {
			return description;
		}

		public void setDescription(String description) {
			this.description = description;
		}

		public String getType() {
			return type;
		}

		public void setType(String type) {
			this.type = type;
		}

		public String getBusiness() {
			return business;
		}

		public void setBusiness(String business) {
			this.business = business;
		}

		/**
		 * @return all six fields in a single line, for console output
		 */
		@Override
		public String toString() {
			return "SearchResult[company="+company+",url="+url+",verify="+verify+",description="+description+",type="+type+",business="+business+"]";
		}
	}

	/**
	 * Runs the whole fetch / structure / transform / unmarshal pipeline and
	 * prints every extracted {@link SearchResult} to standard output.
	 *
	 * @param args not used
	 * @throws Exception if the download, the XML parsing or the XSLT
	 *         transformation fails
	 */
	public static void main(String[] args) throws Exception {
		// Select Saxon as the JAXP implementation before any TransformerFactory is created.
		System.setProperty("javax.xml.transform.TransformerFactory", "net.sf.saxon.TransformerFactoryImpl");
		String url = "http://search.china.alibaba.com/search/company_search.htm?tracelog=po_searchcompany_select_bf&tracelog=&keywords=%BC%D2%BE%D3%D3%C3%C6%B7&submit=+%D6%D8%D0%C2%CB%D1%CB%F7+";
		// 1. use httpclient, get html source
		String origContent = fetchHtml(url);
		// 2. use seastar, transform html to dom (xml)
		Seastar ss = new Seastar();
		SeastarResult result = ss.structString(origContent);
		Document doc = DocumentHelper.parseText(result.getContents());
		// 3. use saxon/xslt, extract all search result
		String extracted = applyStylesheet(doc, url);
		// 4. use xstream, transform xml to objects
		// NOTE(review): the XML handed to XStream is derived from a remote page;
		// recent XStream releases expect an explicit type allow-list - confirm
		// against the XStream version actually on the classpath.
		XStream xs = new XStream();
		xs.alias("search-result", SearchResult.class);
		Document resultDom = DocumentHelper.parseText(extracted);
		// Iterate as Object and cast per element: avoids the unchecked
		// assignment of the list returned by selectNodes to List<Element>.
		for (Object node : resultDom.selectNodes("//search-result")) {
			SearchResult item = (SearchResult)xs.fromXML(((Element)node).asXML());
			System.out.println(item);
		}
	}

	/**
	 * Downloads the page at <code>url</code> and decodes the body as GBK.
	 *
	 * @param url address of the page to fetch
	 * @return the decoded response body
	 * @throws IOException if the request fails, the server does not answer
	 *         with status 200, or the response carries no body
	 */
	private static String fetchHtml(String url) throws IOException {
		HttpClient client = new HttpClient();
		client.getParams().setSoTimeout(3*60*1000);
		client.getParams().setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY);
		List<Header> defaultHeaders = new ArrayList<Header>();
		defaultHeaders.add(new Header("User-Agent","Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.8.1) Gecko/20061010"));
		defaultHeaders.add(new Header("Connection","keep-alive"));
		defaultHeaders.add(new Header("Keep-Alive","300"));
		client.getParams().setParameter("http.default-headers", defaultHeaders);
		GetMethod get = new GetMethod(url);
		try {
			int status = client.executeMethod(get);
			if (status != HttpStatus.SC_OK) {
				// An error page would otherwise be parsed and silently yield no results.
				throw new IOException("Unexpected HTTP status " + status + " for " + url);
			}
			InputStream body = get.getResponseBodyAsStream();
			if (body == null) {
				throw new IOException("Empty response body for " + url);
			}
			return IOUtils.toString(body,"gbk");
		} finally {
			// Always hand the connection back, even when reading fails.
			get.releaseConnection();
		}
	}

	/**
	 * Applies the <code>alibaba.xsl</code> stylesheet, loaded from the
	 * classpath next to this class, to the given document.
	 *
	 * @param doc structured page to transform
	 * @param systemId system id assigned to the transformation source
	 * @return the serialized transformation result
	 * @throws IOException if the stylesheet resource cannot be found
	 * @throws TransformerException if the stylesheet cannot be compiled or
	 *         the transformation fails
	 */
	private static String applyStylesheet(Document doc, String systemId) throws IOException, TransformerException {
		InputStream xsl = TestSeastarAlibaba.class.getResourceAsStream("alibaba.xsl");
		if (xsl == null) {
			throw new IOException("Stylesheet alibaba.xsl not found on the classpath next to " + TestSeastarAlibaba.class.getName());
		}
		try {
			TransformerFactory factory = TransformerFactory.newInstance();
			Transformer transformer = factory.newTransformer( new StreamSource(xsl) );
			DocumentSource source = new DocumentSource( doc );
			source.setSystemId(systemId);
			StringWriter writer = new StringWriter();
			StreamResult res = new StreamResult(writer);
			transformer.transform( source, res );
			return writer.toString();
		} finally {
			IOUtils.closeQuietly(xsl);
		}
	}

}

alibaba.xsl : XSLT template Download

<?xml version="1.0" encoding="utf-8"?>
<!--
    Extracts the company search results from the structured (XML) form of an
    alibaba search page. The output is a <results> element holding one
    <search-result> per offer block; the child element names (company, url,
    verify, description, type, business) are the ones the consuming code
    maps onto its SearchResult fields.
-->
<xsl:stylesheet version="2.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
    xmlns:saxon="http://saxon.sf.net/"
    xmlns:my="myFunctions"
    extension-element-prefixes="saxon my"
    >
    <!-- Returns its argument as text. Not called by the template below. -->
    <xsl:function name="my:merge"><xsl:param name="a"/><xsl:value-of select="$a"/></xsl:function>
    <xsl:template match="/">
    <results>
    <!--
        One iteration per offer block. normalize-space() makes the match
        independent of leading/trailing blanks in the class attribute, so
        both class=" offer" and class="offer" are accepted.
    -->
    <xsl:for-each select="//div[normalize-space(@class)='offer']">
        <search-result>
            <!-- Company name and link come from the anchor inside a span of the info div. -->
            <company><xsl:value-of select=".//div[@class='info']/span/a"/></company>
            <url><xsl:value-of select=".//div[@class='info']/span/a/@href"/></url>
            <!-- The remaining fields are picked by span position within the info div. -->
            <verify><xsl:value-of select=".//div[@class='info']/span[2]"/></verify>
            <description><xsl:value-of select=".//div[@class='info']/span[3]"/></description>
            <!-- Type is the non-blank text sitting directly inside the info div, outside any span. -->
            <type><xsl:value-of select=".//div[@class='info']/text()[string-length(normalize-space(.))>0]"/></type>
            <business><xsl:value-of select=".//div[@class='info']/span[4]"/></business>
        </search-result>
    </xsl:for-each>
    </results>
</xsl:template>
</xsl:stylesheet>

Products: Sealion Seacat Seaflower Seaspider Seasnipe Seastar Seadog Jiong WBXL Xultray webapp
iDocSet iDocSetHelper Blink templateJS skiafy tranid xiliplayer xilihelper i.zhuatang 原创歌曲
(C) 2024 ZHUATANG.COM, All rights reserved

update: 2009-06-14