网站邮箱email地址定向采集核心代码分享 | 乐趣区

邮箱采集demo：http://www.jsanai.com/emailco...
原理：

1、根据要采集的url地址，获取页面html内容，然后采用正则匹配出页面的url列表、邮箱地址列表。

2、获取到url列表及邮箱地址后，分两个异步线程分别处理：

①保存邮箱地址；

②分析采集子页面url的邮箱地址；

核心源码（golang）：

//采集网站地址入口方法func CollectEmail(hosturl string) (EmailObj, []string, error) {    emailObj := new(EmailObj)    var inhost []string    //获取主域名    uparse, err := url.Parse(hosturl)    if err != nil {        return *emailObj, inhost, err    }    emailObj.Surl = hosturl    //    bodystr, err := HttpGetV2(hosturl)    if err != nil {        return *emailObj, inhost, errors.New("get request error")    }    //是否是gbk编码    pos := strings.Index(bodystr, "charset=gb")    pos2 := strings.Index(bodystr, "bg2312")    if pos != -1 || pos2 != -1 {        decodeBytes, err := simplifiedchinese.GB18030.NewDecoder().Bytes([]byte(bodystr))        if err != nil {            return *emailObj, inhost, errors.New("simplifiedchinese coding change error")        }        bodystr = string(decodeBytes)    }    //获取邮箱地地址    emailObj.Emails = append(emailObj.Emails, matchEmail(bodystr)...)    //获取联系手机    emailObj.Phones = append(emailObj.Phones, matchPhone(bodystr)...)    //获取内页链接列表    matchUrls := matchUrls(bodystr)    for _, item := range matchUrls {        itemparse, err := url.Parse(item)        if err != nil {            continue        }        if strings.Index(itemparse.Path, ".js") != -1 || strings.Index(itemparse.Path, ".css") != -1 {            continue        }        if itemparse.Host == uparse.Host {            inhost = append(inhost, item)        }        if itemparse.Scheme != "http" && itemparse.Scheme != "https" {            if strings.Index(itemparse.Path, "/") == 0 {                inhost = append(inhost, uparse.Scheme+"://"+uparse.Host+itemparse.Path)            } else {                inhost = append(inhost, uparse.Scheme+"://"+uparse.Host+"/"+itemparse.Path)            }            continue        }    }    //获取内页email    inhost = RemoveRepeatedElement(inhost)    emailObj.Emails = RemoveRepeatedElement(emailObj.Emails)    return *emailObj, inhost, nil}func matchEmail(str string) (email []string) {    var emailList []string    //re, _ := 
regexp.Compile("\\<style[\\S\\s]+?\\</style\\>")    re, _ := regexp.Compile(`<style[\S\s]+?</style>`)    str = re.ReplaceAllString(str, "")    //re, _ = regexp.Compile("\\<script[\\S\\s]+?\\</script\\>")    re, _ = regexp.Compile(`<script[\S\s]+?</script>`)    str = re.ReplaceAllString(str, "")    //替换html标签    re, _ = regexp.Compile(`<[^>]*?>`)    str = re.ReplaceAllString(str, "")    //只匹配com com.cn cn org org.cn net    reg := regexp.MustCompile(`\w+[@|#]{1}\w+\.(com|cn|org|net|org\.cn|com\.cn)`)    match := reg.FindAllStringSubmatch(str, -1)    for _, matched := range match {        emailList = append(emailList, strings.Replace(strings.ToLower(matched[0]), "#", "@", -1))    }    return emailList[:]}func matchUrls(str string) (urls []string) {    var urlList []string    reg := regexp.MustCompile("<a[^>]*?href=[\"|']+([^\"]*?)[\"|'][^>]*?>[^<]*?</a>")    match := reg.FindAllStringSubmatch(str, -1)    for _, matched := range match {        urlList = append(urlList, matched[1])    }    return urlList[:]}

核心代码使用golang实现，有近6个月的实际使用及改进时间，请放心。
其中涉及到爬虫相关内容，由于当前大部分网站都有反爬虫协议，请大家在使用时多加注意。