import java.io.UnsupportedEncodingException; import java.net.URLEncoder; import java.util.ArrayList; import java.util.Iterator; import java.util.List; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.http.HttpEntity; import org.apache.http.HttpResponse; import org.apache.http.HttpStatus; import org.apache.http.client.HttpClient; import org.apache.http.client.methods.HttpGet; import org.apache.http.client.utils.URLEncodedUtils; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.util.EntityUtils; public class WebContent { public static String AREA_LIST = "\\[list\\]"; public static String AREA_TITLE ="\\[title\\]"; public static String AREA_URL = "\\[url\\]"; public static String AREA_MESSAGE = "\\[message\\]"; public static String PAGE_URL_NUM = "\\[num\\]"; public static int PAGE_STYLE_LIST = 1; public static int PAGE_STYLE_NEXT = 2; /** * 给定一个列表url,得到本url对象的html,然后其它操作在这个基础上做 * @param url * @param encoding * @return * @throws Exception */ public String getHtml(String url, String encoding) throws Exception { String value = null; HttpClient httpclient = new DefaultHttpClient(); HttpGet httpget = new HttpGet(url); // 以下这条如果不加会发现无论你设置Accept-Charset为gbk还是utf-8,他都会默认返回gb2312(本例针对google.cn来说) httpget.setHeader("User-Agent", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.2)"); // 用逗号分隔显示可以同时接受多种编码 httpget.setHeader("Accept-Language", "zh-cn,zh;q=0.5"); httpget.setHeader("Accept-Charset", "GB2312,utf-8;q=0.7,*;q=0.7"); HttpResponse response = httpclient.execute(httpget); // 判断页面返回状态判断是否进行转向抓取新链接 int statusCode = response.getStatusLine().getStatusCode(); if ((statusCode == HttpStatus.SC_MOVED_PERMANENTLY) || (statusCode == HttpStatus.SC_MOVED_TEMPORARILY) || (statusCode == HttpStatus.SC_SEE_OTHER) || (statusCode == HttpStatus.SC_TEMPORARY_REDIRECT)) { // 此处重定向处理 此处还未验证 String newUri = response.getLastHeader("Location").getValue(); httpclient = new DefaultHttpClient(); httpget = new HttpGet(newUri); response = httpclient.execute(httpget); } // Get hold of the response entity HttpEntity entity = response.getEntity(); // If the response does not enclose an entity, there is no need // to bother about connection release if (entity != null) { // 将源码流保存在一个byte数组当中,因为可能需要两次用到该流, byte[] bytes = EntityUtils.toByteArray(entity); // 如果头部Content-Type中包含了编码信息,那么我们可以直接在此处获取 if(encoding != null && !"".equals(encoding.trim())){ value = new String(bytes, encoding); }else{ String charSet = EntityUtils.getContentCharSet(entity); // 如果头部中没有,那么我们需要 查看页面源码,这个方法虽然不能说完全正确,因为有些粗糙的网页编码者没有在页面中写头部编码信息 if (charSet == null || "".equals(charSet.trim())) { String regEx="(?= getListUrl(String areaHtml,String regex,String withPrefix){ return this.getListUrl(areaHtml, regex, withPrefix, false, null); } /** * 提取给定区域的文章列表地址 * @param areaHtml * @param regex * @param withPrefix 返回的地址是否需要加一个前缀,如21cn的新闻,提取到后是没有前缀的,得加一个 * @param hasOmit 提取文章内分页时,是否有分页是否有省略号,如淘宝的就有。 * @param pageNumRegex 提取文章内的分页正则表达式 * @return */ public List getListUrl(String areaHtml,String regex,String withPrefix,boolean hasOmit,String pageNumRegex){ List list = new ArrayList(); regex = regex.replaceFirst(WebContent.AREA_URL, "(.*?)"); Pattern pa = Pattern.compile(regex, Pattern.MULTILINE); Matcher ma = pa.matcher(areaHtml); while (ma.find()) { if(withPrefix !=null && !"".equals(withPrefix.trim())){ list.add(withPrefix+ma.group(1).trim()); }else{ list.add(ma.group(1).trim()); } } if(hasOmit && list.size()>1){ String lasturl = list.get(list.size()-1); list.clear(); regex = "(.*?)"+pageNumRegex.replaceFirst(WebContent.PAGE_URL_NUM, "(-?\\\\d+)"); pa = Pattern.compile(regex); ma = pa.matcher(lasturl); if(ma.find()){ int num = Integer.parseInt(ma.group(2)); String prefix = ma.group(1).trim(); String baseurl = prefix+pageNumRegex; for(int i = 2; i <= num; i ++) list.add(baseurl.replaceFirst(WebContent.PAGE_URL_NUM, i+"")); } } return list; } /** * 这个仅用于淘宝中的url替换成我的搜索推广 * @param oriMessage * @param regex * @return */ public String messageUrlReplease(String oriMessage){ String regex = "(.*?)\\<"; Pattern pa = Pattern.compile(regex,Pattern.MULTILINE); Matcher ma = pa.matcher(oriMessage); String searchStr = null; if(ma.find()){ try { searchStr= URLEncoder.encode(ma.group(2), "GBK"); // System.out.println(ma.group(2)); } catch (UnsupportedEncodingException e) { // TODO Auto-generated catch block e.printStackTrace(); } } regex = "href=\"http://search8.taobao.com(.*?)\""; pa = Pattern.compile(regex,Pattern.MULTILINE); ma = pa.matcher(oriMessage); String repleaceStr = "http://z.alimama.com/tksEncrypt.php?q="+searchStr+"&cat=16&pid=mm_16462935_0_0&unid=&commend=all&search_type=auction&user_action=initiative&f=D9_5_1&at_topsearch=1&sid=%286e3f806637f4a4de24b5b7a43a41aefc%29&sort=&spercent=0&st=0"; while(ma.find()){ oriMessage = ma.replaceAll("href=\""+repleaceStr+"\""); } return oriMessage; } public static void main(String[] args){ //21cn // String url ="http://news.21cn.com/world/guojisaomiao/list1.shtml"; // String prefixUrl = "http://news.21cn.com"; // String encoding = "gb2312"; // String arearegex = "[list]
"; // String urlregex = ""; // String titleRegex = "

[title]

"; // String messageRegex = "[message]"; // boolean iSpagination = false; //taobao String url ="http://info.taobao.com/list/lady/23/30/2330b6ed-16ce-4d0e-b7fa-58a411e1871a_1.php"; String prefixUrl = ""; String encoding = "GBK"; String arearegex = "