最近用golang采集网页时遇到了各种无法识别的乱码字符串,它们大多是gbk、gb2312、big5、windows-1252等编码。有时候,网页上并没有声明编码,却使用了上面这些编码;也有网页声明的编码和实际使用的编码不同,给网页编码转换工作带来诸多不便,更常见的是依据声明的编码转换出来仍然是乱码,着实让人头疼。于是,为了得到一个通用可行的中文字符串编码转换方法,自己用网络上上百万个网站做测试,采集数据回来进行编码转换,终于总结出了一套在绝大部分情况下都能顺利将网页中文字符串转换成utf-8编码的办法。

golang项目直接引用

安装依赖包

go get github.com/fesiong/goproject/convert

使用说明

对外公开有3个函数:Request函数支持请求网络页面,并自动检测页面内容的编码,转换成utf-8;ToUtf8函数会自动检测传入字符串的编码,并转换成utf-8;Convert函数需要传入原始编码和输出编码,如果原始编码传入出错,则转换出来的文本会乱码。

请求网络页面,并自动检测页面内容的编码,转换成utf-8

link := "http://www.youth.cn/"
resp, err := Request(link)
if err != nil {
	t.Error(err.Error())
}

自动检测传入字符串的编码,并转换成utf-8

content := "中国青年网"
content = ToUtf8(content)

传入原始编码和输出编码

content := "中国青年网"
content = Convert(content, "utf-8", "utf-8")

源码地址

github.com/fesiong/goproject

核心的编码转换判断函数

// Package convert fetches web pages and converts text in legacy Chinese
// encodings (GBK, GB2312, GB18030, Big5) to UTF-8.
package convert

import (
	"net/http"
	"regexp"
	"strings"
	"time"

	"github.com/axgle/mahonia"
	"github.com/parnurzeal/gorequest"
	"golang.org/x/net/html/charset"
)

// Compiled once at package scope instead of on every toUtf8 call.
var (
	// metaCharsetRe captures the charset name declared in a <meta> tag.
	// Whitespace is tolerated around '=' and after an optional opening quote.
	metaCharsetRe = regexp.MustCompile(`(?is)<meta[^>]*charset\s*=\s*["']?\s*([A-Za-z0-9-]+)`)

	// titleRe captures the inner text of the first <title> element.
	titleRe = regexp.MustCompile(`(?is)<title[^>]*>(.*?)</title>`)
)

// RequestData holds the parts of an HTTP response that Request exposes to
// callers. Body has already been passed through the UTF-8 conversion.
type RequestData struct {
	Header     http.Header
	Request    *http.Request
	Body       string
	Status     string
	StatusCode int
}

// Request performs a GET on urlPath with a 90 second timeout, detects the
// encoding of the response body and returns it converted to UTF-8.
//
// If the request fails and urlPath starts with "https://", the request is
// retried once over plain "http://". The error returned in that case is the
// one from the retry. For any other failure the first error reported by the
// HTTP client is returned together with a nil *RequestData.
func Request(urlPath string) (*RequestData, error) {
	resp, body, errs := gorequest.New().Timeout(90 * time.Second).Get(urlPath).End()
	if len(errs) > 0 {
		// Fall back to http only when the scheme can actually be rewritten.
		// Testing the full "https://" prefix guarantees the retried URL
		// differs from the current one, so the recursion always terminates.
		if strings.HasPrefix(urlPath, "https://") {
			return Request("http://" + strings.TrimPrefix(urlPath, "https://"))
		}
		return nil, errs[0]
	}
	defer resp.Body.Close()

	contentType := strings.ToLower(resp.Header.Get("Content-Type"))

	return &RequestData{
		Header:     resp.Header,
		Request:    resp.Request,
		Body:       toUtf8(body, contentType),
		Status:     resp.Status,
		StatusCode: resp.StatusCode,
	}, nil
}

// ToUtf8 detects the encoding of content and returns it converted to UTF-8.
// Content whose encoding cannot be determined, or that is already UTF-8, is
// returned unchanged.
func ToUtf8(content string) string {
	return toUtf8(content, "")
}

// normalizeCharset maps a charset hint (a Content-Type header value, a
// <meta> charset name or a detected encoding name) to one of the encodings
// this package converts from: "gb18030", "big5" or "utf-8". It returns ""
// when the hint names none of them. Matching is case-insensitive.
func normalizeCharset(hint string) string {
	hint = strings.ToLower(hint)
	switch {
	case strings.Contains(hint, "gbk"),
		strings.Contains(hint, "gb2312"),
		strings.Contains(hint, "gb18030"),
		// NOTE(review): windows-1252 is deliberately treated as gb18030.
		// Presumably this is because it is the detector's fallback name and
		// pages labelled that way are assumed to be Chinese; confirm.
		strings.Contains(hint, "windows-1252"):
		return "gb18030"
	case strings.Contains(hint, "big5"):
		return "big5"
	case strings.Contains(hint, "utf-8"):
		return "utf-8"
	}
	return ""
}

// toUtf8 determines the encoding of content and converts it to UTF-8.
//
// The encoding is looked up in this order, stopping at the first hit:
//  1. contentType, typically the HTTP Content-Type header value;
//  2. a charset declared in a <meta> tag inside content;
//  3. detection run on the text of the <title> element.
//
// If no supported encoding is found, or it is already UTF-8, content is
// returned unchanged.
func toUtf8(content string, contentType string) string {
	htmlEncode := normalizeCharset(contentType)

	if htmlEncode == "" {
		if match := metaCharsetRe.FindStringSubmatch(content); len(match) > 1 {
			htmlEncode = normalizeCharset(match[1])
		}
	}

	if htmlEncode == "" {
		if match := titleRe.FindStringSubmatch(content); len(match) > 1 {
			// Only the detected name is needed; the encoding object and the
			// certainty flag are intentionally ignored.
			_, detected, _ := charset.DetermineEncoding([]byte(match[1]), "")
			htmlEncode = normalizeCharset(detected)
		}
	}

	if htmlEncode != "" && htmlEncode != "utf-8" {
		content = Convert(content, htmlEncode, "utf-8")
	}
	return content
}

// Convert re-encodes src from the srcCode charset into the tagCode charset.
//
// If srcCode does not match the real encoding of src, the result is garbled.
// If either charset name is unknown to the mahonia library, src is returned
// unchanged rather than panicking on a nil codec.
func Convert(src string, srcCode string, tagCode string) string {
	decoder := mahonia.NewDecoder(srcCode)
	encoder := mahonia.NewEncoder(tagCode)
	if decoder == nil || encoder == nil {
		return src
	}
	// Decode to UTF-8 first, then encode into the target charset. The target
	// side must be an encoder; a decoder would reinterpret the UTF-8 bytes.
	return encoder.ConvertString(decoder.ConvertString(src))
}