download:高级爬虫实战 - 系统掌握破解反爬技术(无密)
对网页中各种不同格式的发布时间进行抽取,将发布时间以规整的“yyyy-MM-dd HH:mm:ss”格式表示出来。只能尽量追求精确,但是由于网络发布时间的格式十分灵活,所以做不到百分之百地正确抽取。
package whu.extract.pubtime.core;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import whu.utils.TimeUtil;
/**
 * Created on 2014-03-13 14:49:05.
 * @description Extracts the publication time of a web page.
 */
public class FetchPubTime {
/**
 * Matches a contiguous 8-digit date (20xxMMdd) in a URL that is preceded by a separator,
 * e.g. http://www.baidu.com/20140311/2356.html.
 * Note: inside the character class the '|' is a literal, so '|' is accepted as a
 * separator in addition to '-', '/' and '_'.
 */
private static String url_reg_whole= "([-|/|_]{1}20\\d{6})";
/**
 * Matches a separator-delimited date with year, month and day,
 * e.g. http://www.baidu.com/2014-3-11/2356.html.
 * Month and day may each be one or two digits.
 */
private static String url_reg_sep_ymd = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2}[-|/|_]{1}\\d{1,2})";
/**
 * Matches a separator-delimited date with year and month only,
 * e.g. http://www.baidu.com/2014-3/2356.html.
 */
private static String url_reg_sep_ym = "([-|/|_]{1}20\\d{2}[-|/|_]{1}\\d{1,2})";
// Calendar obtained once when the class is initialized; it is not refreshed afterwards.
private static Calendar current = Calendar.getInstance();
/**
 * Regular expression for a well-formed date/time string: a date whose parts may be
 * separated by '-', '/' or whitespace (29 February is only accepted in the leap-year
 * alternative), optionally followed by a time of day.
 * NOTE(review): the hour alternative {@code [1-2][0-3]} matches 10-13 and 20-23 only, so
 * hours 14-19 appear to be rejected - confirm whether that is intended.
 * NOTE(review): not referenced in the visible part of this file - confirm where it is used.
 */
private static String rightTimeReg = "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))))|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))(\\s(((0?[0-9])|([1-2][0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$";
/**
 * Determines the publication time of a page, preferring the URL over the page text.
 *
 * @param url        address of the page; handed to {@code getPubTimeFromUrl}
 * @param urlContent text of the page; only consulted when the URL yields nothing
 * @return the value found in the URL if there is one; otherwise the result of
 *         {@code extractPageDate(urlContent)} when the content is non-blank;
 *         otherwise {@code null}
 */
public static String getPubTimeVarious(String url, String urlContent) {
    final String fromUrl = getPubTimeFromUrl(url);
    if (fromUrl != null) {
        return fromUrl;
    }

    // Nothing usable in the link itself: fall back to the page text, if any.
    final boolean hasContent = urlContent != null && !"".equals(urlContent.trim());
    return hasContent ? extractPageDate(urlContent) : null;
}
/**
 * Extracts a publication date embedded in a URL and returns it as
 * {@code yyyy-MM-dd 00:00:00}.
 *
 * <p>Three URL shapes are tried in order, and only the first match of each is examined:
 * <ol>
 *   <li>a contiguous 8-digit date ({@code url_reg_whole}), e.g. {@code /20140311/};</li>
 *   <li>a separated year-month-day ({@code url_reg_sep_ymd}), e.g. {@code /2014-3-11/};</li>
 *   <li>a separated year-month ({@code url_reg_sep_ym}), e.g. {@code /2014-3/}; the day
 *       defaults to the 1st.</li>
 * </ol>
 * A candidate is skipped, and the next shape tried, when it is not a real calendar date
 * or when it lies in the future.
 *
 * @param url the URL to inspect; may be {@code null}
 * @return the date formatted as {@code yyyy-MM-dd 00:00:00} (zero-padded month and day),
 *         or {@code null} if {@code url} is {@code null} or holds no acceptable date
 */
public static String getPubTimeFromUrl(String url) {
    if (url == null) {
        return null;
    }

    // 1) Contiguous yyyyMMdd directly after a separator.
    Matcher wholeMatcher = Pattern.compile(url_reg_whole).matcher(url);
    if (wholeMatcher.find()) {
        // Drop the leading separator character captured by the pattern.
        String digits = wholeMatcher.group().substring(1);
        String formatted = formatUrlDateIfNotFuture(
                Integer.parseInt(digits.substring(0, 4)),
                Integer.parseInt(digits.substring(4, 6)),
                Integer.parseInt(digits.substring(6, 8)));
        if (formatted != null) {
            return formatted;
        }
    }

    // 2) Year, month and day split by separators.
    Matcher ymdMatcher = Pattern.compile(url_reg_sep_ymd).matcher(url);
    if (ymdMatcher.find()) {
        String[] seg = ymdMatcher.group().substring(1).split("[-|/|_]{1}");
        String formatted = formatUrlDateIfNotFuture(
                Integer.parseInt(seg[0]),
                Integer.parseInt(seg[1]),
                Integer.parseInt(seg[2]));
        if (formatted != null) {
            return formatted;
        }
    }

    // 3) Year and month only; the first day of the month is assumed.
    Matcher ymMatcher = Pattern.compile(url_reg_sep_ym).matcher(url);
    if (ymMatcher.find()) {
        String[] seg = ymMatcher.group().substring(1).split("[-|/|_]{1}");
        String formatted = formatUrlDateIfNotFuture(
                Integer.parseInt(seg[0]),
                Integer.parseInt(seg[1]),
                1);
        if (formatted != null) {
            return formatted;
        }
    }

    return null;
}

/**
 * Validates a year/month/day triple and formats it as {@code yyyy-MM-dd 00:00:00}.
 *
 * @param year  calendar year
 * @param month month of the year, 1-based (1 = January)
 * @param day   day of the month, 1-based
 * @return the formatted date, or {@code null} if the triple is not a real calendar date
 *         or its midnight is later than the current time
 */
private static String formatUrlDateIfNotFuture(int year, int month, int day) {
    if (month < 1 || month > 12 || day < 1) {
        return null;
    }

    Calendar candidate = Calendar.getInstance();
    // Start from a blank calendar so the time of day is midnight, not "now".
    candidate.clear();
    // Calendar.MONTH is zero-based, hence month - 1.
    candidate.set(year, month - 1, 1);
    if (day > candidate.getActualMaximum(Calendar.DAY_OF_MONTH)) {
        return null;
    }
    candidate.set(Calendar.DAY_OF_MONTH, day);

    // A publication date can never be later than the present moment.
    if (Calendar.getInstance().compareTo(candidate) < 0) {
        return null;
    }

    return year + "-" + (month < 10 ? "0" : "") + month
            + "-" + (day < 10 ? "0" : "") + day + " 00:00:00";
}