乐趣区

关于算法:手撸golang-基本数据结构与算法-网页排名PageRank随机游走

缘起

最近浏览 << 我的第一本算法书 >>(【日】石田保辉;宫崎修一)
本系列笔记拟采纳 golang 练习之

网页排名 (PageRank/ 佩奇排名), 随机游走

 网页排名(PageRank,也叫作佩奇排名)是一种在搜寻网页时对搜寻后果进行排序的算法。网页排名就是利用网页之间的链接构造计算出网页价值的算法。在网页排名中,链入页面越多的网页,它的重要性也就越高。假如没有链入页面的网页权重为 1。有链入页面的网页权重是其链入页面的权重之和。如果一个网页链向多个页面,那么其链向的所有页面将平分它的权重。在网页排名中,链入的页面越多,该网页所收回的链接的价值就越高。能够应用“随机游走模型”(random walk model)来解决网页互链的问题.

用户浏览网页的操作就能够这样来定义:用户等概率跳转到以后网页所链向的一个网页的概率为 1 -α;等概率近程跳转到其余网页中的一个网页的概率为 α。模仿用户随机拜访页面的过程, 
每拜访一个页面, 其权重加 1,
直到拜访的总次数达到 N 次为止,
每个页面的权重值代表的是“某一刻正在浏览这个网页的概率”,可间接将其作为网页的权重来应用。摘自 << 我的第一本算法书 >>【日】石田保辉;宫崎修一 

指标

  • 实现基于随机游走模型的 PageRank 算法, 并验证其有效性和稳定性 (网页权重在模仿若干次后, 保持稳定)

设计

  • IPage: 网页模型接口
  • IPageRanking: 网页排名算法接口
  • tPage: 网页模型的实现
  • tRandomWalkPageRanking: 基于随机游走模型的 PageRank 算法, 实现 IPageRanking 接口

单元测试

  • page_rank_test.go, 验证网页排名算法的有效性和稳定性
  • 首先通过简略 case 验证其有效性
  • 而后随机生成大批量随机互链的网页, 验证在多轮随机游走后, 网页权重的稳定性
package others

import (
    "fmt"
    pr "learning/gooop/others/page_rank"
    "math/rand"
    "sort"
    "testing"
    "time"
)

func Test_PageRank(t *testing.T) {fnAssertTrue := func(b bool, msg string) {
        if !b {t.Fatal(msg)
        }
    }

    t.Log("testing simple case")
    p11 := pr.NewPage("p11")
    p12 := pr.NewPage("p12")
    p13 := pr.NewPage("p13")
    p21 := pr.NewPage("p21")
    p22 := pr.NewPage("p22")
    p31 := pr.NewPage("p31")
    p32 := pr.NewPage("p32")

    p11.AddLink(p21)
    p11.AddLink(p22)
    p12.AddLink(p21)
    p12.AddLink(p22)
    p13.AddLink(p21)
    p13.AddLink(p22)

    p21.AddLink(p31)
    p22.AddLink(p31)

    p31.AddLink(p32)
    p32.AddLink(p31)

    samples := []pr.IPage{p11,p12,p13, p21, p22, p31, p32,}
    pr.RandomWalkPageRanking.RankAll(samples, 1000)
    sort.Sort(sort.Reverse(pr.IPageSlice(samples)))
    for _,p := range samples {t.Log(p)
    }
    fnAssertTrue(samples[0].ID() == "p31", "expecting top.1 = p31")
    fnAssertTrue(samples[1].ID() == "p32", "expecting top.2 = p32")
    fnAssertTrue(samples[2].ID() == "p21" || samples[2].ID() == "p22", "expecting top.3 in (p21,p22)")
    fnAssertTrue(samples[3].ID() == "p21" || samples[3].ID() == "p22", "expecting top.4 in (p21,p22)")


    // generate 1000 random pages
    iPageCount := 1000
    pages := make([]pr.IPage, iPageCount)
    for i,_ := range pages {pages[i] = pr.NewPage(fmt.Sprintf("p%02d.com", i))
    }

    r := rand.New(rand.NewSource(time.Now().UnixNano()))
    for i,p := range pages {
        // add random page links
        for j,iPageLinks := 0, r.Intn(10);j < iPageLinks; {n := r.Intn(iPageCount)
            if n != i {
                j++
                p.AddLink(pages[n])
            }
        }
    }

    // rank pages and get top 100
    mapTop100 := make(map[string]bool, 0)
    fnTestRanking := func(rounds int) {t.Logf("testing page rank with %v rounds", rounds)
        bFirstRound := len(mapTop100) == 0

        // page ranking
        pr.RandomWalkPageRanking.RankAll(pages, rounds)

        // sort pages by ranking
        sort.Sort(sort.Reverse(pr.IPageSlice(pages)))

        hits := 0
        for i,p := range pages {
            if i < 10 {t.Log(p)
            }

            if i < 100 {
                if bFirstRound {mapTop100[p.ID()] = true
                } else if _,ok := mapTop100[p.ID()];ok {hits++}
            } else {break}
        }

        if !bFirstRound {t.Logf("hit rate: %s%v", "%", hits)
        }
    }

    // test stability under different rounds
    fnTestRanking(3000)
    fnTestRanking(10000)
    fnTestRanking(30000)
    fnTestRanking(90000)
}

测试输入

  • 测试表明, 当随机游走的总次数 >= 网页数量 * 每个网页的均匀收回链接数时, 所得排名是比较稳定的
$ go test -v page_rank_test.go 
=== RUN   Test_PageRank
    page_rank_test.go:19: testing simple case
    page_rank_test.go:47: p(p31,   0.4206, [p32])
    page_rank_test.go:47: p(p32,   0.3673, [p31])
    page_rank_test.go:47: p(p21,   0.0639, [p31])
    page_rank_test.go:47: p(p22,   0.0604, [p31])
    page_rank_test.go:47: p(p11,   0.0300, [p21 p22])
    page_rank_test.go:47: p(p12,   0.0291, [p21 p22])
    page_rank_test.go:47: p(p13,   0.0287, [p21 p22])
    page_rank_test.go:77: testing page rank with 3000 rounds
    page_rank_test.go:89: p(p604.com,   0.0039, [])
    page_rank_test.go:89: p(p807.com,   0.0035, [p709.com p328.com p303.com p110.com p858.com p394.com p231.com p731.com p83.com])
    page_rank_test.go:89: p(p72.com,   0.0034, [p249.com p347.com p604.com p533.com p416.com p958.com p966.com p385.com])
    page_rank_test.go:89: p(p712.com,   0.0033, [p485.com p451.com p236.com p141.com p168.com p693.com])
    page_rank_test.go:89: p(p452.com,   0.0032, [p588.com p344.com p618.com p258.com p394.com p285.com p780.com p606.com p89.com])
    page_rank_test.go:89: p(p709.com,   0.0031, [p666.com p554.com p103.com p202.com p230.com])
    page_rank_test.go:89: p(p975.com,   0.0029, [])
    page_rank_test.go:89: p(p840.com,   0.0029, [p974.com p698.com p49.com p851.com p73.com])
    page_rank_test.go:89: p(p867.com,   0.0028, [p588.com p196.com p931.com p381.com p621.com p848.com])
    page_rank_test.go:89: p(p633.com,   0.0028, [p840.com])
    page_rank_test.go:77: testing page rank with 10000 rounds
    page_rank_test.go:89: p(p604.com,   0.0039, [])
    page_rank_test.go:89: p(p807.com,   0.0034, [p709.com p328.com p303.com p110.com p858.com p394.com p231.com p731.com p83.com])
    page_rank_test.go:89: p(p72.com,   0.0034, [p249.com p347.com p604.com p533.com p416.com p958.com p966.com p385.com])
    page_rank_test.go:89: p(p452.com,   0.0033, [p588.com p344.com p618.com p258.com p394.com p285.com p780.com p606.com p89.com])
    page_rank_test.go:89: p(p712.com,   0.0033, [p485.com p451.com p236.com p141.com p168.com p693.com])
    page_rank_test.go:89: p(p709.com,   0.0031, [p666.com p554.com p103.com p202.com p230.com])
    page_rank_test.go:89: p(p975.com,   0.0029, [])
    page_rank_test.go:89: p(p840.com,   0.0029, [p974.com p698.com p49.com p851.com p73.com])
    page_rank_test.go:89: p(p612.com,   0.0028, [p116.com p562.com p179.com p37.com p761.com])
    page_rank_test.go:89: p(p319.com,   0.0028, [p726.com p63.com p558.com p301.com p185.com p944.com])
    page_rank_test.go:104: hit rate: %98
    page_rank_test.go:77: testing page rank with 30000 rounds
    page_rank_test.go:89: p(p604.com,   0.0039, [])
    page_rank_test.go:89: p(p807.com,   0.0034, [p709.com p328.com p303.com p110.com p858.com p394.com p231.com p731.com p83.com])
    page_rank_test.go:89: p(p72.com,   0.0034, [p249.com p347.com p604.com p533.com p416.com p958.com p966.com p385.com])
    page_rank_test.go:89: p(p452.com,   0.0033, [p588.com p344.com p618.com p258.com p394.com p285.com p780.com p606.com p89.com])
    page_rank_test.go:89: p(p712.com,   0.0032, [p485.com p451.com p236.com p141.com p168.com p693.com])
    page_rank_test.go:89: p(p709.com,   0.0031, [p666.com p554.com p103.com p202.com p230.com])
    page_rank_test.go:89: p(p975.com,   0.0029, [])
    page_rank_test.go:89: p(p840.com,   0.0029, [p974.com p698.com p49.com p851.com p73.com])
    page_rank_test.go:89: p(p319.com,   0.0028, [p726.com p63.com p558.com p301.com p185.com p944.com])
    page_rank_test.go:89: p(p612.com,   0.0028, [p116.com p562.com p179.com p37.com p761.com])
    page_rank_test.go:104: hit rate: %98
    page_rank_test.go:77: testing page rank with 90000 rounds
    page_rank_test.go:89: p(p604.com,   0.0039, [])
    page_rank_test.go:89: p(p807.com,   0.0034, [p709.com p328.com p303.com p110.com p858.com p394.com p231.com p731.com p83.com])
    page_rank_test.go:89: p(p72.com,   0.0034, [p249.com p347.com p604.com p533.com p416.com p958.com p966.com p385.com])
    page_rank_test.go:89: p(p452.com,   0.0033, [p588.com p344.com p618.com p258.com p394.com p285.com p780.com p606.com p89.com])
    page_rank_test.go:89: p(p712.com,   0.0032, [p485.com p451.com p236.com p141.com p168.com p693.com])
    page_rank_test.go:89: p(p709.com,   0.0031, [p666.com p554.com p103.com p202.com p230.com])
    page_rank_test.go:89: p(p975.com,   0.0029, [])
    page_rank_test.go:89: p(p840.com,   0.0029, [p974.com p698.com p49.com p851.com p73.com])
    page_rank_test.go:89: p(p612.com,   0.0028, [p116.com p562.com p179.com p37.com p761.com])
    page_rank_test.go:89: p(p319.com,   0.0028, [p726.com p63.com p558.com p301.com p185.com p944.com])
    page_rank_test.go:104: hit rate: %98
--- PASS: Test_PageRank (13.93s)
PASS
ok      command-line-arguments  13.936s

IPage.go

网页模型接口

package page_rank

import "fmt"

type IPage interface {
    fmt.Stringer

    ID() string

    GetWeight() float64
    SetWeight(float64)

    GetLinks() []IPage
    AddLink(IPage)
}


type IPageSlice []IPage

func (me IPageSlice) Len() int {return len(me)
}

func (me IPageSlice) Less(i,j int) bool {return me[i].GetWeight() < me[j].GetWeight()}

func (me IPageSlice) Swap(i,j int) {me[i], me[j] = me[j], me[i]
}

IPageRanking.go

网页排名算法接口

package page_rank

type IPageRanking interface {RankAll(pages []IPage, rounds int)
}

tPage.go

网页模型的实现

package page_rank

import (
    "fmt"
    "strings"
)

type tPage struct {
    id string
    weight float64
    links []IPage}

func NewPage(id string) IPage {
    return &tPage{
        id: id,
        weight: 0,
        links: []IPage{},
    }
}

func (me *tPage) ID() string {return me.id}

func (me *tPage) GetWeight() float64 {return me.weight}

func (me *tPage) SetWeight(w float64) {me.weight = w}

func (me *tPage) GetLinks() []IPage {return me.links}

func (me *tPage) AddLink(p IPage) {me.links = append(me.links, p)
}

func (me *tPage) String() string {linkStrings := make([]string, len(me.links))
    for i,p := range me.links {linkStrings[i] = p.ID()}
    return fmt.Sprintf("p(%v, %8.4f, [%v])", me.id, me.weight, strings.Join(linkStrings, " "))
}

tRandomWalkPageRanking.go

基于随机游走模型的 PageRank 算法, 实现 IPageRanking 接口

package page_rank

import (
    "math/rand"
    "time"
)

type tRandomWalkPageRanking struct {
}

var gPossiblityToLinkedPage = 85

func newRandomWalkPageRanking() IPageRanking {return &tRandomWalkPageRanking{}
}

func (me *tRandomWalkPageRanking) RankAll(pages []IPage, rounds int) {iPageCount := len(pages)
    if iPageCount <= 0 {return}

    r := rand.New(rand.NewSource(time.Now().UnixNano()))
    current := pages[0]

    iVisitCount := iPageCount * rounds
    for i := 0;i < iVisitCount;i++ {
        // visit current page
        current.SetWeight(current.GetWeight() + 1)

        possibility := r.Intn(100)
        if possibility < gPossiblityToLinkedPage && len(current.GetLinks())>0 {
            // goto linked page
            current = me.VisitLinkedPage(current, r)

        } else {
            // goto unlinked page
            current = me.VisitUnlinkedPage(current, pages, r)
        }
    }

    fVisitCount := float64(iVisitCount)
    for _,p := range pages {p.SetWeight(p.GetWeight() / fVisitCount)
    }
}


func (me *tRandomWalkPageRanking) VisitLinkedPage(current IPage, r *rand.Rand) IPage {links := current.GetLinks()
    next := links[r.Intn(len(links))]
    return next
}

func (me *tRandomWalkPageRanking) VisitUnlinkedPage(current IPage, pages []IPage, r *rand.Rand) IPage {mapLinks := make(map[string]bool, 0)
    mapLinks[current.ID()] = true
    for _,p := range current.GetLinks() {mapLinks[p.ID()] = true
    }

    n := len(pages)
    for {next := pages[r.Intn(n)]
        if _,ok := mapLinks[next.ID()];!ok {return next}
    }
}

var RandomWalkPageRanking = newRandomWalkPageRanking()

(end)

退出移动版