关于爬虫:高级爬虫实战系统掌握破解反爬技能

download:高级爬虫实战-系统掌握破解反爬技能 挑战高薪

/**
 * Resolves a publication time for a page, preferring the URL over the page body.
 *
 * <p>The URL is examined first via {@code getPubTimeFromUrl}. Only when that
 * yields {@code null} and {@code urlContent} is non-null and not blank after
 * trimming is the page text handed to {@code extractPageDate}.
 *
 * @param url        address of the page
 * @param urlContent text of the page; may be {@code null} or blank
 * @return the time found in the URL, otherwise whatever {@code extractPageDate}
 *         returns for the content, otherwise {@code null}
 */
public static String getPubTimeVarious(String url, String urlContent) {
    final String fromUrl = getPubTimeFromUrl(url);
    if (fromUrl != null) {
        return fromUrl;
    }

    // Nothing in the link itself; fall back to matching against the page text.
    final boolean hasContent = urlContent != null && !"".equals(urlContent.trim());
    return hasContent ? extractPageDate(urlContent) : null;
}
 
/**
 * Extracts a publication time from a URL and returns it as a
 * {@code yyyy-MM-dd 00:00:00} style string.
 *
 * <p>Three patterns are tried in order, and the first one that yields a date
 * not later than {@code current} wins:
 * <ol>
 *   <li>{@code url_reg_whole} - the match (minus its first character) is parsed
 *       with the {@code "yyyyMMdd"} format;</li>
 *   <li>{@code url_reg_sep_ymd} - the match (minus its first character) is split
 *       on {@code -}, {@code /}, {@code _} or {@code |} into year, month, day;</li>
 *   <li>{@code url_reg_sep_ym} - same splitting, year and month only; the day
 *       defaults to the 1st.</li>
 * </ol>
 *
 * <p>Each branch also requires {@code groupCount() > 0}, i.e. the pattern must
 * declare at least one capturing group. The first character of every match is
 * dropped - presumably a leading separator captured by the patterns; confirm
 * against the regex definitions.
 *
 * @param url the URL to inspect; {@code null} yields {@code null}
 * @return the date string, or {@code null} when no pattern matches or every
 *         matched date lies after {@code current}
 */
public static String getPubTimeFromUrl(String url)
{
    if (url == null)
    {
        return null;
    }

    Matcher m_whole = Pattern.compile(url_reg_whole).matcher(url);
    if (m_whole.find(0) && m_whole.groupCount() > 0)
    {
        String time = m_whole.group(0).substring(1);
        // A publication date can never be later than the reference time.
        if (current.compareTo(TimeUtil.strToCalendar(time, "yyyyMMdd")) >= 0)
        {
            return time.substring(0, 4) + "-" + time.substring(4, 6) + "-" +
                   time.substring(6, 8) + " " + "00:00:00";
        }
    }

    Matcher m_sep = Pattern.compile(url_reg_sep_ymd).matcher(url);
    if (m_sep.find(0) && m_sep.groupCount() > 0)
    {
        String[] seg = m_sep.group(0).substring(1).split("[-|/|_]{1}");
        Calendar theTime = Calendar.getInstance();
        // Drop the inherited wall-clock time so the comparison is made at
        // midnight, matching the "00:00:00" that is returned.
        theTime.clear();
        // Calendar months are zero-based (January == 0); URL months are 1-based.
        theTime.set(Integer.parseInt(seg[0]),
                    Integer.parseInt(seg[1]) - 1,
                    Integer.parseInt(seg[2]));
        if (current.compareTo(theTime) >= 0)
        {
            return seg[0] + "-" + seg[1] + "-" + seg[2] + " " + "00:00:00";
        }
    }

    Matcher m_sep_ym = Pattern.compile(url_reg_sep_ym).matcher(url);
    if (m_sep_ym.find(0) && m_sep_ym.groupCount() > 0)
    {
        String[] seg = m_sep_ym.group(0).substring(1).split("[-|/|_]{1}");
        Calendar theTime = Calendar.getInstance();
        theTime.clear();
        // Zero-based month again; no day in the URL, so use the 1st.
        theTime.set(Integer.parseInt(seg[0]), Integer.parseInt(seg[1]) - 1, 1);
        if (current.compareTo(theTime) >= 0)
        {
            return seg[0] + "-" + seg[1] + "-" + "01" + " " + "00:00:00";
        }
    }

    return null;
}

评论

发表回复

您的邮箱地址不会被公开。 必填项已用 * 标注

这个站点使用 Akismet 来减少垃圾评论。了解你的评论数据如何被处理