在上面的网址中,末尾的 874589732820 是每次抓取时需要替换的参数。
package ups.test;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
public class Test {
    /** Timeout, in milliseconds, applied to both connecting and reading. */
    private static final int TIMEOUT_MILLIS = 5000;

    /** Returned in place of the page content whenever the fetch fails. */
    private static final String FAILURE_MESSAGE = "错误:读取网页失败!";

    /**
     * Downloads a web page and returns its raw content.
     *
     * <p>As a side effect, a tag-stripped version of the downloaded text is
     * printed to standard output. The returned value is the unmodified
     * (but trimmed) page source.
     *
     * @param strUrl         address of the page to fetch; must be an HTTP(S) URL
     * @param strPostRequest extra data to send as the request body; when it is
     *                       non-empty the request is sent with output enabled
     *                       (i.e. as a POST), when it is empty or {@code null}
     *                       no body is sent
     * @param maxLength      maximum number of characters to read; a value of
     *                       zero or less means "no limit"
     * @return the trimmed page content, or a fixed error message if anything
     *         goes wrong (bad URL, connection failure, timeout, ...)
     */
    public String getPageContent(String strUrl, String strPostRequest, int maxLength) {
        HttpURLConnection hConnect = null;
        try {
            URL newUrl = new URL(strUrl);
            hConnect = (HttpURLConnection) newUrl.openConnection();
            // Per-connection timeouts instead of mutating JVM-wide system
            // properties on every call.
            hConnect.setConnectTimeout(TIMEOUT_MILLIS);
            hConnect.setReadTimeout(TIMEOUT_MILLIS);

            // Extra data for a POST request.
            if (strPostRequest != null && strPostRequest.length() > 0) {
                hConnect.setDoOutput(true);
                // Explicit charset so the body does not depend on the platform default.
                try (OutputStreamWriter out =
                        new OutputStreamWriter(hConnect.getOutputStream(), "UTF-8")) {
                    out.write(strPostRequest);
                }
            }

            // Read the response, stopping early once maxLength characters are collected.
            StringBuilder buffer = new StringBuilder();
            try (BufferedReader rd = new BufferedReader(
                    new InputStreamReader(hConnect.getInputStream(), "utf-8"))) {
                int ch;
                while ((maxLength <= 0 || buffer.length() < maxLength)
                        && (ch = rd.read()) > -1) {
                    buffer.append((char) ch);
                }
            }

            String page = buffer.toString();
            System.out.println(stripMarkup(page));
            return page.trim();
        } catch (Exception e) {
            // Deliberately broad: callers rely on receiving the error message
            // for every kind of failure rather than an exception.
            return FAILURE_MESSAGE;
        } finally {
            if (hConnect != null) {
                hConnect.disconnect();
            }
        }
    }

    /**
     * Removes named character entities (such as {@code &nbsp;}) and anything
     * that looks like a tag from the given text.
     *
     * <p>Strings are immutable, so the result of {@code replaceAll} has to be
     * used; the input itself is never changed.
     */
    private static String stripMarkup(String html) {
        return html.replaceAll("&[a-zA-Z]{1,10};", "").replaceAll("<[^>]*>", "");
    }
}
package ups.test;
public class Test1 {
    /** Tracking page address; the tracking number is appended to it. */
    private static final String TRACKING_URL_PREFIX =
            "http://www.fedex.com/Tracking?clienttype=dotcomreg&ascend_header=1&cntry_code=cn&language=sim&mi=n&tracknumbers=";

    /** Tracking number used when none is given on the command line. */
    private static final String DEFAULT_TRACKING_NUMBER = "874589732820";

    /**
     * Fetches the tracking page for one tracking number and prints a
     * confirmation message when done.
     *
     * @param args optional; {@code args[0]} is the tracking number to look up.
     *             When it is missing or blank the default number is used.
     */
    public static void main(String[] args) {
        String trackingNumber = DEFAULT_TRACKING_NUMBER;
        if (args != null && args.length > 0 && args[0].trim().length() > 0) {
            trackingNumber = args[0].trim();
        }
        String url = TRACKING_URL_PREFIX + encode(trackingNumber);

        Test p = new Test();
        // NOTE(review): a non-empty second argument is sent as the request
        // body, so this issues a POST whose body is the text "post". Kept
        // as-is; confirm whether a plain GET (empty string) was intended.
        p.getPageContent(url, "post", 100500);
        System.out.print("已经执行!");
    }

    /** URL-encodes a query parameter value so it cannot alter the rest of the URL. */
    private static String encode(String value) {
        try {
            return java.net.URLEncoder.encode(value, "UTF-8");
        } catch (java.io.UnsupportedEncodingException e) {
            // UTF-8 is required to be available on every Java platform.
            throw new IllegalStateException("UTF-8 is not supported", e);
        }
    }
}
现在能够抓取到网页的全部代码,但我只需要“货件托运历史”里的信息:日期/时间、活动、地点、详细信息。不知道该怎么提取,求指导。
RISEBY
相关分类