download:高级爬虫实战-系统掌握破解反爬技能无密

对网页中各种不同格式的发布时间进行抽取,将发布时间以规整的“yyyy-MM-dd HH:mm:ss”格式表示出来。只能尽量追求准确,但是因为网络发布时间的格式非常灵活,所以做不到百分之百地正确抽取。

package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;

/**
 * Created on 2014-03-13 14:49:05.
 *
 * @description Extracts the publication time of a web page.
 */

public class FetchPubTime {

/** 示意url中间断的8位日期,例如http://www.baidu.com/20140311/2356.html */private static String url_reg_whole= "([-|/|_]{1}20\\d{6})";/** 示意 用-或者/隔开的日期,有年月日的,例如 http://www.baidu.com/2014-3-11/2356.html  */private static String url_reg_sep_ymd = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2}[-|/|_]{1}\\d{1,2})";/** 示意 用-或者/隔开的日期,只有年和月份的,例如 http://www.baidu.com/2014-3/2356.html  */private static String url_reg_sep_ym = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2})";private static Calendar current = Calendar.getInstance();/** 格局正确的工夫正则表达式*/private static String rightTimeReg = "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))))|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))(\\s(((0?[0-9])|([1-2][0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$"; /** * @param url * @param urlContent * @return */public static String getPubTimeVarious(String url,String urlContent) {         String pubTime = getPubTimeFromUrl(url);       //链接外面没有,匹配文本中的    if(pubTime == null)    {        if(urlContent!=null&&!urlContent.trim().equals(""))            return extractPageDate(urlContent);    }         return pubTime;} /**从url外面抽取出公布工夫,返回YYYY-MM-DD HH:mm:ss格局的字符串 * @param url * @return */public static String getPubTimeFromUrl(String url){    Pattern p_whole = Pattern.compile(url_reg_whole);    Matcher m_whole = p_whole.matcher(url);    if(m_whole.find(0)&&m_whole.groupCount()>0)    {       String time =  m_whole.group(0);       time = time.substring(1,time.length());       //每一步都不可能超出以后工夫              if(current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd"))>=0)    {       return time.substring(0,4)+"-"+time.substring(4,6)+"-"+              
time.substring(6,8)+" "+"00:00:00";    }    }       p_whole = null;    m_whole = null;    Pattern p_sep = Pattern.compile(url_reg_sep_ymd);    Matcher m_sep = p_sep.matcher(url);    if(m_sep.find(0)&&m_sep.groupCount()>0)    {         String time =  m_sep.group(0);         time = time.substring(1,time.length());         String[] seg = time.split("[-|/|_]{1}");         Calendar theTime = Calendar.getInstance();         theTime.set(Calendar.YEAR,Integer.parseInt(seg[0]));         theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]));         theTime.set(Calendar.DAY_OF_MONTH, Integer.parseInt(seg[2]));         if(current.compareTo(theTime)>=0)            {                 return seg[0]+"-"+seg[1]+"-"+seg[2]+" "+"00:00:00";            }    }    p_sep = null;    m_sep = null;    Pattern p_sep_ym = Pattern.compile(url_reg_sep_ym);    Matcher m_sep_ym = p_sep_ym.matcher(url);    if(m_sep_ym.find(0)&&m_sep_ym.groupCount()>0)    {         String time =  m_sep_ym.group(0);         time = time.substring(1,time.length());         Calendar theTime = Calendar.getInstance();         String[] seg = time.split("[-|/|_]{1}");         theTime.set(Calendar.YEAR,Integer.parseInt(seg[0]));         theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]));         theTime.set(Calendar.DAY_OF_MONTH, 1);         if(current.compareTo(theTime)>=0)        {                  return seg[0]+"-"+seg[1]+"-"+"01"+" "+"00:00:00";        }    }         return null;}