On Crawlers: Advanced Crawler Practice for Systematically Mastering Anti-Crawling Countermeasures

The goal here is to extract publication times that appear in many different formats on web pages and normalize them into the regular "yyyy-MM-dd HH:mm:ss" form. Because publication times on the web are written in extremely varied ways, the extraction can only strive for accuracy; it cannot be 100% correct.
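To make the target format concrete, here is a small self-contained sketch (an illustration of my own, not code from the course) that normalizes a few hypothetical raw date strings into the canonical "yyyy-MM-dd HH:mm:ss" form with SimpleDateFormat. The FetchPubTime class that follows pursues the same goal, but drives the extraction with regular expressions over URLs and page text.

import java.text.ParseException;
import java.text.SimpleDateFormat;

public class NormalizeDemo {

    /** Candidate input patterns; real pages use far more variants (these three are assumed for illustration). */
    private static final String[] INPUT_PATTERNS = { "yyyyMMdd", "yyyy-MM-dd", "yyyy/MM/dd HH:mm" };

    /** Tries each candidate pattern and returns the first successful parse in canonical form, or null. */
    public static String normalize(String raw) {
        SimpleDateFormat target = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        for (String p : INPUT_PATTERNS) {
            SimpleDateFormat in = new SimpleDateFormat(p);
            in.setLenient(false);                      // reject impossible dates such as 2014-13-40
            try {
                return target.format(in.parse(raw));
            } catch (ParseException ignored) {
                // not this pattern, try the next one
            }
        }
        return null;                                   // nothing matched
    }

    public static void main(String[] args) {
        System.out.println(normalize("20140311"));        // 2014-03-11 00:00:00
        System.out.println(normalize("2014/3/11 14:49")); // 2014-03-11 14:49:00
    }
}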

package whu.extract.pubtime.core;

import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collections;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import whu.utils.TimeUtil;

/**
 * Created on 2014-03-13 14:49:05
 * @description Extracts the publication time of a web page
 */
public class FetchPubTime {

/** Eight consecutive date digits in the URL, e.g. http://www.baidu.com/20140311/2356.html */
private static String url_reg_whole = "([-/_]20\\d{6})";
/** A full year-month-day date separated by -, / or _, e.g. http://www.baidu.com/2014-3-11/2356.html */
private static String url_reg_sep_ymd = "([-/_]20\\d{2}[-/_]\\d{1,2}[-/_]\\d{1,2})";
/** A year-month date separated by -, / or _, e.g. http://www.baidu.com/2014-3/2356.html */
private static String url_reg_sep_ym = "([-/_]20\\d{2}[-/_]\\d{1,2})";
private static Calendar current = Calendar.getInstance();
/** Regular expression for a correctly formatted date/time string */
private static String rightTimeReg = "^((\\d{2}(([02468][048])|([13579][26]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])))))|(\\d{2}(([02468][1235679])|([13579][01345789]))[\\-\\/\\s]?((((0?[13578])|(1[02]))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(3[01])))|(((0?[469])|(11))[\\-\\/\\s]?((0?[1-9])|([1-2][0-9])|(30)))|(0?2[\\-\\/\\s]?((0?[1-9])|(1[0-9])|(2[0-8]))))))(\\s(((0?[0-9])|([1-2][0-3]))\\:([0-5]?[0-9])((\\s)|(\\:([0-5]?[0-9])))))?$";
 
/**
 * Tries the URL first and falls back to the page content if the URL yields nothing.
 * @param url        the page URL
 * @param urlContent the textual content of the page
 * @return the publication time as "yyyy-MM-dd HH:mm:ss", or null if none was found
 */
public static String getPubTimeVarious(String url, String urlContent) {
    String pubTime = getPubTimeFromUrl(url);

    // Nothing usable in the URL itself: try the page content instead.
    if (pubTime == null && urlContent != null && !urlContent.trim().equals("")) {
        return extractPageDate(urlContent);
    }

    return pubTime;
}
 
/**
 * Extracts the publication time from the URL and returns it as a "yyyy-MM-dd HH:mm:ss" string.
 * @param url the page URL
 * @return the publication time, or null if the URL carries no usable date
 */
public static String getPubTimeFromUrl(String url) {
    // Case 1: eight consecutive date digits, e.g. /20140311/
    Pattern p_whole = Pattern.compile(url_reg_whole);
    Matcher m_whole = p_whole.matcher(url);
    if (m_whole.find(0) && m_whole.groupCount() > 0) {
        String time = m_whole.group(0);
        time = time.substring(1);                     // drop the leading separator
        // A publication time can never lie in the future.
        if (current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd")) >= 0) {
            return time.substring(0, 4) + "-" + time.substring(4, 6) + "-"
                    + time.substring(6, 8) + " 00:00:00";
        }
    }

    // Case 2: year, month and day separated by -, / or _
    Pattern p_sep = Pattern.compile(url_reg_sep_ymd);
    Matcher m_sep = p_sep.matcher(url);
    if (m_sep.find(0) && m_sep.groupCount() > 0) {
        String time = m_sep.group(0);
        time = time.substring(1);
        String[] seg = time.split("[-/_]");
        Calendar theTime = Calendar.getInstance();
        theTime.set(Calendar.YEAR, Integer.parseInt(seg[0]));
        theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]) - 1);   // Calendar months are 0-based
        theTime.set(Calendar.DAY_OF_MONTH, Integer.parseInt(seg[2]));
        if (current.compareTo(theTime) >= 0) {
            return seg[0] + "-" + seg[1] + "-" + seg[2] + " 00:00:00";
        }
    }

    // Case 3: only year and month separated by -, / or _
    Pattern p_sep_ym = Pattern.compile(url_reg_sep_ym);
    Matcher m_sep_ym = p_sep_ym.matcher(url);
    if (m_sep_ym.find(0) && m_sep_ym.groupCount() > 0) {
        String time = m_sep_ym.group(0);
        time = time.substring(1);
        String[] seg = time.split("[-/_]");
        Calendar theTime = Calendar.getInstance();
        theTime.set(Calendar.YEAR, Integer.parseInt(seg[0]));
        theTime.set(Calendar.MONTH, Integer.parseInt(seg[1]) - 1);   // Calendar months are 0-based
        theTime.set(Calendar.DAY_OF_MONTH, 1);
        if (current.compareTo(theTime) >= 0) {
            return seg[0] + "-" + seg[1] + "-01 00:00:00";
        }
    }

    return null;
}
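The listing above stops before extractPageDate, the fallback that scans the page body (presumably the place where rightTimeReg is applied), and before the class's closing brace. Purely as a rough idea of what such a fallback could look like, and not as the author's original method, the sketch below reuses the class's existing imports and its current field: it pulls candidate date strings out of the page text with a hypothetical PAGE_DATE pattern, discards any candidate that lies in the future, and renders the first surviving one in the canonical format.

/** Hypothetical sketch of the missing extractPageDate(); not the original implementation. */
private static Pattern PAGE_DATE = Pattern.compile(
        "(20\\d{2})[-/年.](\\d{1,2})[-/月.](\\d{1,2})日?(?:\\s*(\\d{1,2}):(\\d{1,2})(?::(\\d{1,2}))?)?");

public static String extractPageDate(String text) {
    Matcher m = PAGE_DATE.matcher(text);
    while (m.find()) {
        Calendar c = Calendar.getInstance();
        c.set(Integer.parseInt(m.group(1)),
              Integer.parseInt(m.group(2)) - 1,                    // Calendar months are 0-based
              Integer.parseInt(m.group(3)),
              m.group(4) == null ? 0 : Integer.parseInt(m.group(4)),
              m.group(5) == null ? 0 : Integer.parseInt(m.group(5)),
              m.group(6) == null ? 0 : Integer.parseInt(m.group(6)));
        if (current.compareTo(c) >= 0) {                           // a publication time cannot be in the future
            return new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss").format(c.getTime());
        }
    }
    return null;
}

For the URL branch alone, getPubTimeFromUrl("http://www.baidu.com/20140311/2356.html") should already return "2014-03-11 00:00:00": the eight-digit pattern matches and 2014-03-11 does not lie in the future.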
