A simple Python implementation of Metapath2vec
Here we work with three graph structures: paper-coauthor-paper, paper-cotitle-paper, and paper-covenue-paper. That is, there is a single node type and three edge types, and the metapath scheme we walk is coauthor-covenue-coauthor-cotitle (see `metapath_type` below).
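The snippets below assume a few inputs built beforehand: three DGL graphs (`coauthor_graph`, `cotitle_graph`, `covenue_graph`), per-graph edge weights (`weights_coauthor`, `weights_cotitle`, `weights_covenue`), node labels `labels`, and the total paper count `sum_papers`. A minimal sketch of that setup, with made-up placeholder edges and labels:

import numpy as np
import torch
import dgl
from tqdm import tqdm

sum_papers = 100                                  # number of paper nodes (placeholder)
labels = np.random.randint(0, 4, sum_papers)      # one class id per paper (placeholder)
# toy edges; in practice these come from the co-author / co-title / co-venue relations
src = torch.randint(0, sum_papers, (500,))
dst = torch.randint(0, sum_papers, (500,))
coauthor_graph = dgl.graph((src, dst), num_nodes=sum_papers)
cotitle_graph = dgl.graph((dst, src), num_nodes=sum_papers)
covenue_graph = dgl.graph((src, dst), num_nodes=sum_papers)
# dense weight matrices, indexed later as weights[src][cand_nodes]
weights_coauthor = np.ones((sum_papers, sum_papers))
weights_cotitle = np.ones((sum_papers, sum_papers))
weights_covenue = np.ones((sum_papers, sum_papers))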
def positive_sampler(path):
    '''
    Slide a window of size `window` over each path.
    E.g. for path 0 1 2 3 4 with window=2, this returns pos_u=[0,0,1,...], pos_v=[1,2,0,...]
    '''
    pos_u, pos_v = [], []
    for i in range(len(path)):
        if len(path) == 1:  # a single-node path yields no pairs
            continue
        u = path[i]
        # context: up to `window` nodes on each side of position i
        v = np.concatenate([path[max(i - window, 0):i], path[i + 1:i + window + 1]], axis=0)
        pos_u.extend([u] * len(v))
        pos_v.extend(v)
    return pos_u, pos_v
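A quick sanity check of the sampler, assuming `window = 2` (it is set further down):

window = 2
pos_u, pos_v = positive_sampler([0, 1, 2, 3, 4])
print(list(zip(pos_u, pos_v)))   # (0,1), (0,2), (1,0), (1,2), (1,3), ...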
def get_negative_ratio(metapath):
    '''
    Build a negative-sampling table from node frequencies: the more often a node
    appears in the walks, the more likely it is drawn as a negative sample.
    Returns the sampling probability of each node, indexed by sorted node id.
    '''
    node_frequency = dict()
    node_count = 0
    for path in metapath:
        for node in path:
            node_frequency[node] = node_frequency.get(node, 0) + 1
            node_count += 1
    # word2vec-style smoothing: raise counts to the 0.75 power, then normalize
    pow_frequency = np.array(list(map(lambda x: x[-1], sorted(node_frequency.items(), key=lambda asd: asd[0])))) ** 0.75
    node_pow = np.sum(pow_frequency)
    ratio = pow_frequency / node_pow
    return ratio
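The 0.75 exponent is the same unigram-distribution smoothing word2vec uses: frequent nodes stay more likely to be drawn as negatives, but less so than their raw counts would suggest. A toy check:

walks = [[0, 0, 1], [0, 2]]        # node 0 appears three times, nodes 1 and 2 once each
print(get_negative_ratio(walks))   # ~[0.53, 0.23, 0.23], i.e. 3**0.75 / (3**0.75 + 1 + 1), ...

Note that `ratio` is indexed by sorted node id, so the later `np.random.choice(nodes, ..., p=ratio)` call assumes every node in `nodes` appears in at least one walk.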
def negative_sampler(path, ratio, nodes):
    '''
    Draw negative samples according to the probability table `ratio`
    built by the previous function.
    '''
    negatives_size = 5
    negatives = []
    while len(negatives) < negatives_size:
        temp = np.random.choice(nodes, size=negatives_size - len(negatives), replace=False, p=ratio)
        # keep only draws that are not on the current path (reject positives)
        negatives.extend([node for node in temp if node not in path])
    return negatives
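Since every node on a walk is a potential context, draws that land on the current path are rejected and redrawn, which is why the function loops until it has collected five negatives. Usage, continuing the toy example (actual draws are random):

nodes = list(range(10))
ratio = get_negative_ratio([list(range(10))])   # uniform walks -> uniform ratio
print(negative_sampler([0, 1], ratio, nodes))   # five ids, none of them on the path [0, 1]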
def create_node2node_dict(graph):
多肉作文400字'''
输⼊的是dgl建⽴的图
返回的是个字典类型,保存的是在该图中,每个结点可以到达的结点
'''
src_dst={}
for src,dst in zip(graph.edges()[0],graph.edges()[1]):
src,dst=src.item(),dst.item()
if src not in src_dst.keys():
src_dst[src]=[]
src_dst[src].append(dst)
return src_dst
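On a toy DGL graph:

g = dgl.graph((torch.tensor([0, 0, 1]), torch.tensor([1, 2, 2])))
print(create_node2node_dict(g))   # {0: [1, 2], 1: [2]}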
window = 2        # window size used when sampling positive pairs from a metapath
metapaths = []    # all generated metapath walks
num_walks = 10    # how many walks to start from each node
walk_len = 100    # length of each path
metapath_type = ['coauthor', 'covenue', 'coauthor', 'cotitle']  # per the paper, the authors use AVAT
# one dict per graph: key = node id, value = the node ids reachable from that key in the graph
edge_per_graph = {}
edge_per_graph['coauthor'] = create_node2node_dict(coauthor_graph)
edge_per_graph['cotitle'] = create_node2node_dict(cotitle_graph)
edge_per_graph['covenue'] = create_node2node_dict(covenue_graph)
weights_all_graph = {'coauthor': weights_coauthor, 'cotitle': weights_cotitle, 'covenue': weights_covenue}
def Is_isolate(node):
    # a node is isolated if it has no outgoing edge in any relation graph
    for rel in metapath_type:
        if node in edge_per_graph[rel].keys():
            return 0
    return 1
for walk in tqdm(range(num_walks)):
    for cur_node in list(range(len(labels))):  # start one walk from every node in the graph
        stop = 0
        path = []
        path.append(cur_node)
        while len(path) < walk_len and stop == 0:
            for rel in metapath_type:
                if len(path) == walk_len or Is_isolate(cur_node):
                    stop = 1
                    break
                if edge_per_graph[rel].get(cur_node, -1) == -1:
                    # cur_node has no outgoing edge of this relation; move on to the next relation
                    continue
                cand_nodes = edge_per_graph[rel][cur_node]
                # weighted transition: pick the next node in proportion to its edge weight.
                # If you don't need weights, comment out these two lines and drop p=weighted_ratio below.
                weights_per_candnodes = weights_all_graph[rel][cur_node][cand_nodes]
                weighted_ratio = weights_per_candnodes * 1.0 / np.sum(weights_per_candnodes)
                cur_node = np.random.choice(cand_nodes, size=1, p=weighted_ratio)[0]
                path.append(cur_node)
        metapaths.append(path)
pos_us, pos_vs, neg_vs = [], [], []
nodes = list(range(sum_papers))
ratio = get_negative_ratio(metapaths)
for path in metapaths:
    pos_u, pos_v = positive_sampler(path)
    for u, v in zip(pos_u, pos_v):
        # one set of 5 negatives per positive pair
        negative_nodes = negative_sampler(path, ratio, nodes)
        neg_vs.append(negative_nodes)
    pos_us.extend(pos_u)
    pos_vs.extend(pos_v)
pos_us = torch.LongTensor(pos_us)
pos_vs = torch.LongTensor(pos_vs)
neg_vs = torch.LongTensor(neg_vs)
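It is worth checking the shapes at this point; with P positive pairs in total, the model below expects:

print(pos_us.shape, pos_vs.shape, neg_vs.shape)   # torch.Size([P]), torch.Size([P]), torch.Size([P, 5])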
The `metapaths` collected here are the full set of walks; corresponding elements of `pos_us` and `pos_vs` form the positive pairs and `neg_vs` the negatives, matching u_i, u_c, and u_j in the skip-gram model below.
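For reference, the per-pair loss the model minimizes is the standard negative-sampling skip-gram objective, with $\sigma$ the sigmoid and $M = 5$ negatives per pair:

$$ O = -\log \sigma\left(X_{u} \cdot X'_{c}\right) - \sum_{j=1}^{M} \log \sigma\left(-X_{u} \cdot X'_{u_j}\right) $$

where $X$ is `u_embeddings` (center node $u$, from `pos_us`), $X'$ is `v_embeddings` (context $c$ from `pos_vs` and negatives $u_j$ from `neg_vs`), and the dot products are clamped to [-10, 10] before the log-sigmoid for numerical stability.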
# plain metapath2vec
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import init
"""
u_embedding: Embedding for center word.
v_embedding: Embedding for neighbor words.
"""
class SkipGramModel(nn.Module):
    def __init__(self, emb_size, emb_dimension):
        super(SkipGramModel, self).__init__()
        self.emb_dimension = emb_dimension
        self.u_embeddings = nn.Embedding(emb_size, emb_dimension)
        self.v_embeddings = nn.Embedding(emb_size, emb_dimension)
        initrange = 1.0 / self.emb_dimension
        init.uniform_(self.u_embeddings.weight.data, -initrange, initrange)

    def forward(self, pos_u, pos_v, neg_v):
        emb_u = self.u_embeddings(pos_u)
        emb_v = self.v_embeddings(pos_v)
        emb_neg_v = self.v_embeddings(neg_v)
        # positive score: dot product of each (u, v) pair, clamped for numerical stability
        score = torch.sum(torch.mul(emb_u, emb_v), dim=1)
        score = torch.clamp(score, max=10, min=-10)
        score = -F.logsigmoid(score)
        # negative score: dot products of each u with its 5 negative samples
        neg_score = torch.bmm(emb_neg_v, emb_u.unsqueeze(2)).squeeze()
        neg_score = torch.clamp(neg_score, max=10, min=-10)
        neg_score = -torch.sum(F.logsigmoid(-neg_score), dim=1)
        return torch.mean(score + neg_score)
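A quick smoke test with arbitrary random indices, just to confirm the shapes flow through forward():

m = SkipGramModel(emb_size=100, emb_dimension=16)
u = torch.randint(0, 100, (8,))       # 8 center nodes
v = torch.randint(0, 100, (8,))       # 8 context nodes
neg = torch.randint(0, 100, (8, 5))   # 5 negatives per pair
print(m(u, v, neg))                   # a scalar loss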
skip_model = SkipGramModel(sum_papers, 64)
optimizer = torch.optim.Adam(skip_model.parameters(), lr=0.001)
losses = []
for epoch in range(500):
    optimizer.zero_grad()
    loss = skip_model(pos_us, pos_vs, neg_vs)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if epoch % 100 == 0:
        print('epoch {0} loss {1}'.format(epoch, loss.item()))
embedding = skip_model.u_embeddings.weight.cpu().data.numpy()
The `embedding` obtained here is the learned embedding of each node, ready to be used for downstream tasks.
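For instance, a simple node-classification probe on the learned vectors (a sketch assuming `labels` holds one class id per paper; scikit-learn is not used elsewhere in this post):

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(embedding, labels, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('test accuracy:', clf.score(X_test, y_test))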