基于Sklearn机器学习实战—基于Sklearn模块的链路预测

34次阅读

共计 5346 个字符,预计需要花费 14 分钟才能阅读完成。

Sklearn 简介
自 2007 年发布以来,scikit-learn 已经成为 Python 重要的机器学习库了。scikit-learn 简称 sklearn,支持包括分类、回归、降维和聚类四大机器学习算法。还包含了特征提取、数据处理和模型评估三大模块。
  sklearn 是 SciPy 的扩展,建立在 NumPy 和 matplotlib 库的基础上。利用这几大模块的优势,可以大大提高机器学习的效率。sklearn 拥有完善的文档,上手容易,具有丰富的 API,在学术界颇受欢迎。sklearn 已经封装了大量的机器学习算法,包括 LIBSVM 和 LIBLINEAR。同时 sklearn 内置了大量数据集,节省了获取和整理数据集的时间。
项目简介
链路预测是通过历史连接信息预测未来可能产生的连接,即通过当前网络中的连边信息预测将来可能产生的连边信息。

项目源码
from sklearn.model_selection import train_test_split # 分割数据模块 from sklearn.neighbors import KNeighborsClassifier # K 最近邻 (kNN,k-NearestNeighbor) 分类算法 import pandas as pdimport numpy as npfrom sklearn.model_selection import train_test_splitfrom sklearn.neighbors import KNeighborsClassifierfrom sklearn import preprocessingimport matplotlib.pyplot as pltfrom sklearn.svm import SVCfrom math import isnan
定义计算共同邻居指标的方法
Define functions that calculate the baseline similarity indices.
计算 Jaccard 相似性指标
def Jaccavrd(MatrixAdjacency_Train):
    """Compute the Jaccard similarity index for every pair of nodes.

    For nodes ``i`` and ``j`` the score is
    ``common / (deg_i + deg_j - common)``, where ``common`` is entry
    ``(i, j)`` of ``A @ A`` and ``deg`` holds the column sums of ``A``.

    Args:
        MatrixAdjacency_Train: Square 2-D adjacency matrix, given as a
            NumPy array or a nested sequence of numbers.

    Returns:
        A float array of the same shape holding the pairwise scores.
        Entries whose denominator is zero are returned as 0.0 instead of
        NaN/inf, so the result can be fed to an estimator directly.
    """
    adjacency = np.asarray(MatrixAdjacency_Train, dtype=float)

    # Entry (i, j) of A @ A is sum_k A[i, k] * A[k, j].
    common = np.dot(adjacency, adjacency)

    # Column sums as an (n, 1) column vector; adding its transpose
    # broadcasts to the full matrix of deg_i + deg_j.
    degree = adjacency.sum(axis=0).reshape(-1, 1)
    union = degree + degree.T - common

    # Divide only where the denominator is non-zero; the remaining cells
    # keep the 0.0 they were initialised with.
    scores = np.zeros_like(common)
    np.divide(common, union, out=scores, where=union != 0)
    return scores

定义计算 Salton 指标的方法
def Salton_Cal(MatrixAdjacency_Train):
    """Compute the Salton (cosine) similarity index for every pair of nodes.

    For nodes ``i`` and ``j`` the score is
    ``common / sqrt(deg_i * deg_j)``, where ``common`` is entry ``(i, j)``
    of ``A @ A`` and ``deg`` holds the column sums of ``A``.

    Args:
        MatrixAdjacency_Train: Square 2-D adjacency matrix, given as a
            NumPy array or a nested sequence of numbers.

    Returns:
        A float array of the same shape holding the pairwise scores, passed
        through ``np.nan_to_num`` so that 0/0 cells become 0.0.
    """
    adjacency = np.asarray(MatrixAdjacency_Train, dtype=float)

    # Entry (i, j) of A @ A is sum_k A[i, k] * A[k, j].
    common = np.dot(adjacency, adjacency)

    # Outer product of the column-sum vector gives deg_i * deg_j.
    degree = adjacency.sum(axis=0).reshape(-1, 1)
    norm = np.sqrt(np.dot(degree, degree.T))

    # Silence the divide-by-zero warnings only for this one division,
    # instead of changing NumPy's process-wide error settings.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = common / norm

    return np.nan_to_num(ratio)

def file2matrix(filepath, size=50):
    """Read a whitespace-separated square matrix from a text file.

    Each non-blank line supplies one row. Only the first ``size`` values of
    a line and the first ``size`` non-blank lines are used; rows the file
    does not provide stay zero.

    Args:
        filepath: Path of the text file to read.
        size: Number of rows and columns of the matrix (defaults to 50).

    Returns:
        A ``(size, size)`` float array.

    Raises:
        ValueError: If a used line has fewer than ``size`` values or holds
            a token that cannot be parsed as a float.
    """
    matrix = np.zeros((size, size), dtype=float)
    row = 0
    # The context manager closes the file even if parsing fails.
    with open(filepath) as handle:
        for line in handle:
            tokens = line.split()
            if not tokens:
                # Skip blank lines (e.g. a trailing newline at end of file).
                continue
            if row >= size:
                break
            values = tokens[:size]
            if len(values) != size:
                raise ValueError(
                    "expected %d values on matrix row %d of %r, got %d"
                    % (size, row, filepath, len(values))
                )
            # Write exactly one row; slicing with ``matrix[row:]`` would
            # overwrite every following row as well.
            matrix[row, :] = [float(token) for token in values]
            row += 1
    return matrix

# Load three adjacency matrices from disk and compute the Jaccard and
# Salton similarity matrices for each of them.
filepath = '3600/s0001.txt'
MatrixAdjacency = file2matrix(filepath)
similarity_matrix_Jaccavrd = Jaccavrd(MatrixAdjacency)
similarity_matrix_Salton = Salton_Cal(MatrixAdjacency)

filepath2 = '3600/s0002.txt'
MatrixAdjacency2 = file2matrix(filepath2)
similarity_matrix_Jaccavrd2 = Jaccavrd(MatrixAdjacency2)
similarity_matrix_Salton2 = Salton_Cal(MatrixAdjacency2)

filepath3 = '3600/s0003.txt'
MatrixAdjacency3 = file2matrix(filepath3)
similarity_matrix_Jaccavrd3 = Jaccavrd(MatrixAdjacency3)
similarity_matrix_Salton3 = Salton_Cal(MatrixAdjacency3)
# Build one sample table per input file. Every node pair (i, j) with i < j
# becomes a row: column 0 is the Jaccard score, column 1 the Salton score
# and column 2 the adjacency-matrix entry (the label).


def _upper_triangle(matrix):
    """Return the entries above the main diagonal (i < j) in row-major order."""
    rows, cols = np.triu_indices(matrix.shape[0], k=1, m=matrix.shape[1])
    return matrix[rows, cols]


def _build_samples(jaccard, salton, adjacency):
    """Stack the pairwise Jaccard, Salton and adjacency values as columns."""
    columns = []
    for similarity in (jaccard, salton):
        values = _upper_triangle(similarity)
        # NaN scores (zero denominators) are treated as "no similarity".
        columns.append(np.where(np.isnan(values), 0.0, values))
    columns.append(_upper_triangle(adjacency))
    return np.column_stack(columns)


data = _build_samples(
    similarity_matrix_Jaccavrd, similarity_matrix_Salton, MatrixAdjacency
)
data2 = _build_samples(
    similarity_matrix_Jaccavrd2, similarity_matrix_Salton2, MatrixAdjacency2
)
data3 = _build_samples(
    similarity_matrix_Jaccavrd3, similarity_matrix_Salton3, MatrixAdjacency3
)

# Split each sample table into its two feature columns (Jaccard, Salton)
# and its label column (adjacency entry).
data_train_X = data[:, 0:2]
data_train_y = data[:, 2]
data_test_X = data2[:, 0:2]
data_test_y = data2[:, 2]
data_target_X = data3[:, 0:2]
data_target_y = data3[:, 2]

# k-nearest-neighbours: fit on the first table, then print the predictions
# for the second table's features next to that table's actual labels.
knn = KNeighborsClassifier()
knn.fit(data_train_X, data_train_y)
print(knn.predict(data_test_X))
print(data_test_y)

# SVC: features of one table are paired with the labels of the *next* table
# (fit: data -> data2 labels, score: data2 -> data3 labels).
# NOTE(review): this looks like a deliberate "predict the next snapshot's
# links" setup rather than a mix-up of variables -- confirm with the author.
clf = SVC()
clf.fit(data_train_X, data_test_y)
print(clf.score(data_test_X, data_target_y))
项目详细了解
如需详细了解本项目信息,可发送邮件至 18770918982@gmail.com

正文完
 0