diff --git a/README.md b/README.md index 9d867a0..c4ad6f1 100644 --- a/README.md +++ b/README.md @@ -30,4 +30,5 @@ torch>=1.2.0 1. Cora数据集无法下载 Cora数据集地址是:[kimiyoung/planetoid](https://github.com/kimiyoung/planetoid/tree/master/data)。 -仓库中提供了一份使用到的cora数据,可以分别将它放在 `chapter5/cora/raw` 或者 `chapter7/cora/raw` 目录下。 +~~仓库中提供了一份使用到的cora数据,可以分别将它放在 `chapter5/cora/raw` 或者 `chapter7/cora/raw` 目录下。~~ +新代码直接使用本地数据. diff --git a/chapter5/GCN_Cora.ipynb b/chapter5/GCN_Cora.ipynb index eaa1245..2a79164 100644 --- a/chapter5/GCN_Cora.ipynb +++ b/chapter5/GCN_Cora.ipynb @@ -96,11 +96,10 @@ "\n", "\n", "class CoraData(object):\n", - " download_url = \"https://github.com/kimiyoung/planetoid/raw/master/data\"\n", " filenames = [\"ind.cora.{}\".format(name) for name in\n", " ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']]\n", "\n", - " def __init__(self, data_root=\"cora\", rebuild=False):\n", + " def __init__(self, data_root=\"../data/cora\", rebuild=False):\n", " \"\"\"Cora数据,包括数据下载,处理,加载等功能\n", " 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘\n", "\n", @@ -115,19 +114,18 @@ " Args:\n", " -------\n", " data_root: string, optional\n", - " 存放数据的目录,原始数据路径: {data_root}/raw\n", - " 缓存数据路径: {data_root}/processed_cora.pkl\n", + " 存放数据的目录,原始数据路径: ../data/cora\n", + " 缓存数据路径: {data_root}/ch5_cached.pkl\n", " rebuild: boolean, optional\n", " 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据\n", "\n", " \"\"\"\n", " self.data_root = data_root\n", - " save_file = osp.join(self.data_root, \"processed_cora.pkl\")\n", + " save_file = osp.join(self.data_root, \"ch5_cached.pkl\")\n", " if osp.exists(save_file) and not rebuild:\n", " print(\"Using Cached file: {}\".format(save_file))\n", " self._data = pickle.load(open(save_file, \"rb\"))\n", " else:\n", - " self.maybe_download()\n", " self._data = self.process_data()\n", " with open(save_file, \"wb\") as f:\n", " pickle.dump(self.data, f)\n", @@ -145,7 +143,7 @@ " \"\"\"\n", " print(\"Process data ...\")\n", " _, tx, allx, y, ty, ally, graph, test_index = [self.read_data(\n", - " osp.join(self.data_root, \"raw\", name)) for name in self.filenames]\n", + " osp.join(self.data_root, name)) for name in self.filenames]\n", " train_index = np.arange(y.shape[0])\n", " val_index = np.arange(y.shape[0], y.shape[0] + 500)\n", " sorted_test_index = sorted(test_index)\n", @@ -174,13 +172,6 @@ " return Data(x=x, y=y, adjacency=adjacency,\n", " train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)\n", "\n", - " def maybe_download(self):\n", - " save_path = os.path.join(self.data_root, \"raw\")\n", - " for name in self.filenames:\n", - " if not osp.exists(osp.join(save_path, name)):\n", - " self.download_data(\n", - " \"{}/{}\".format(self.download_url, name), save_path)\n", - "\n", " @staticmethod\n", " def build_adjacency(adj_dict):\n", " \"\"\"根据邻接表创建邻接矩阵\"\"\"\n", @@ -210,19 +201,6 @@ " return out\n", "\n", " @staticmethod\n", - " def download_data(url, save_path):\n", - " \"\"\"数据下载工具,当原始数据不存在时将会进行下载\"\"\"\n", - " if not os.path.exists(save_path):\n", - " os.makedirs(save_path)\n", - " data = urllib.request.urlopen(url)\n", - " filename = os.path.split(url)[-1]\n", - "\n", - " with open(os.path.join(save_path, filename), 'wb') as f:\n", - " f.write(data.read())\n", - "\n", - " return True\n", - "\n", - " @staticmethod\n", " def normalization(adjacency):\n", " \"\"\"计算 L=D^-0.5 * (A+I) * D^-0.5\"\"\"\n", " adjacency += sp.eye(adjacency.shape[0]) # 增加自连接\n", diff --git a/chapter5/GCN_Cora.py b/chapter5/GCN_Cora.py index 9377f8f..53c888c 100644 --- a/chapter5/GCN_Cora.py +++ b/chapter5/GCN_Cora.py @@ -2,10 +2,20 @@ # coding: utf-8 #

Table of Contents

-#
+#
# # 基于Cora数据集的GCN节点分类 +# +# +# Run in Google Colab +# +# + +# 在Colab中运行时可以通过`代码执行程序->更改运行时类型`选择使用`GPU` + +# ## SetUp + # In[1]: @@ -35,12 +45,15 @@ 'train_mask', 'val_mask', 'test_mask']) +def tensor_from_numpy(x, device): + return torch.from_numpy(x).to(device) + + class CoraData(object): - download_url = "https://github.com/kimiyoung/planetoid/raw/master/data" filenames = ["ind.cora.{}".format(name) for name in ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']] - def __init__(self, data_root="cora", rebuild=False): + def __init__(self, data_root="../data/cora", rebuild=False): """Cora数据,包括数据下载,处理,加载等功能 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘 @@ -55,19 +68,18 @@ def __init__(self, data_root="cora", rebuild=False): Args: ------- data_root: string, optional - 存放数据的目录,原始数据路径: {data_root}/raw - 缓存数据路径: {data_root}/processed_cora.pkl + 存放数据的目录,原始数据路径: ../data/cora + 缓存数据路径: {data_root}/ch5_cached.pkl rebuild: boolean, optional 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据 """ self.data_root = data_root - save_file = osp.join(self.data_root, "processed_cora.pkl") + save_file = osp.join(self.data_root, "ch5_cached.pkl") if osp.exists(save_file) and not rebuild: print("Using Cached file: {}".format(save_file)) self._data = pickle.load(open(save_file, "rb")) else: - self.maybe_download() self._data = self.process_data() with open(save_file, "wb") as f: pickle.dump(self.data, f) @@ -85,7 +97,7 @@ def process_data(self): """ print("Process data ...") _, tx, allx, y, ty, ally, graph, test_index = [self.read_data( - osp.join(self.data_root, "raw", name)) for name in self.filenames] + osp.join(self.data_root, name)) for name in self.filenames] train_index = np.arange(y.shape[0]) val_index = np.arange(y.shape[0], y.shape[0] + 500) sorted_test_index = sorted(test_index) @@ -114,13 +126,6 @@ def process_data(self): return Data(x=x, y=y, adjacency=adjacency, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) - def maybe_download(self): - save_path = os.path.join(self.data_root, "raw") - for name in self.filenames: - if not osp.exists(osp.join(save_path, name)): - self.download_data( - "{}/{}".format(self.download_url, name), save_path) - @staticmethod def build_adjacency(adj_dict): """根据邻接表创建邻接矩阵""" @@ -149,19 +154,6 @@ def read_data(path): out = out.toarray() if hasattr(out, "toarray") else out return out - @staticmethod - def download_data(url, save_path): - """数据下载工具,当原始数据不存在时将会进行下载""" - if not os.path.exists(save_path): - os.makedirs(save_path) - data = urllib.request.urlopen(url) - filename = os.path.split(url)[-1] - - with open(os.path.join(save_path, filename), 'wb') as f: - f.write(data.read()) - - return True - @staticmethod def normalization(adjacency): """计算 L=D^-0.5 * (A+I) * D^-0.5""" @@ -222,10 +214,12 @@ def forward(self, adjacency, input_feature): return output def __repr__(self): - return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')' + return self.__class__.__name__ + ' (' + str(self.input_dim) + ' -> ' + str(self.output_dim) + ')' # ## 模型定义 +# +# 读者可以自己对GCN模型结构进行修改和实验 # In[4]: @@ -251,19 +245,10 @@ def forward(self, adjacency, feature): # 超参数定义 -learning_rate = 0.1 -weight_decay = 5e-4 -epochs = 200 - - -# In[6]: - - -# 模型定义:Model, Loss, Optimizer -device = "cuda" if torch.cuda.is_available() else "cpu" -model = GcnNet().to(device) -criterion = nn.CrossEntropyLoss().to(device) -optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay) +LEARNING_RATE = 0.1 +WEIGHT_DACAY = 5e-4 +EPOCHS = 200 +DEVICE = "cuda" if torch.cuda.is_available() else "cpu" # In[7]: @@ -271,18 +256,31 @@ def forward(self, adjacency, feature): # 加载数据,并转换为torch.Tensor dataset = CoraData().data -x = dataset.x / dataset.x.sum(1, keepdims=True) # 归一化数据,使得每一行和为1 -tensor_x = torch.from_numpy(x).to(device) -tensor_y = torch.from_numpy(dataset.y).to(device) -tensor_train_mask = torch.from_numpy(dataset.train_mask).to(device) -tensor_val_mask = torch.from_numpy(dataset.val_mask).to(device) -tensor_test_mask = torch.from_numpy(dataset.test_mask).to(device) +node_feature = dataset.x / dataset.x.sum(1, keepdims=True) # 归一化数据,使得每一行和为1 +tensor_x = tensor_from_numpy(node_feature, DEVICE) +tensor_y = tensor_from_numpy(dataset.y, DEVICE) +tensor_train_mask = tensor_from_numpy(dataset.train_mask, DEVICE) +tensor_val_mask = tensor_from_numpy(dataset.val_mask, DEVICE) +tensor_test_mask = tensor_from_numpy(dataset.test_mask, DEVICE) normalize_adjacency = CoraData.normalization(dataset.adjacency) # 规范化邻接矩阵 + +num_nodes, input_dim = node_feature.shape indices = torch.from_numpy(np.asarray([normalize_adjacency.row, normalize_adjacency.col]).astype('int64')).long() values = torch.from_numpy(normalize_adjacency.data.astype(np.float32)) tensor_adjacency = torch.sparse.FloatTensor(indices, values, - (2708, 2708)).to(device) + (num_nodes, num_nodes)).to(DEVICE) + + +# In[ ]: + + +# 模型定义:Model, Loss, Optimizer +model = GcnNet(input_dim).to(DEVICE) +criterion = nn.CrossEntropyLoss().to(DEVICE) +optimizer = optim.Adam(model.parameters(), + lr=LEARNING_RATE, + weight_decay=WEIGHT_DACAY) # In[8]: @@ -294,7 +292,7 @@ def train(): val_acc_history = [] model.train() train_y = tensor_y[tensor_train_mask] - for epoch in range(epochs): + for epoch in range(EPOCHS): logits = model(tensor_adjacency, tensor_x) # 前向传播 train_mask_logits = logits[tensor_train_mask] # 只选择训练节点进行监督 loss = criterion(train_mask_logits, train_y) # 计算损失值 @@ -365,5 +363,14 @@ def plot_loss_with_acc(loss_history, val_acc_history): # In[ ]: - +# 绘制测试数据的TSNE降维图 +from sklearn.manifold import TSNE +tsne = TSNE() +out = tsne.fit_transform(test_logits) +fig = plt.figure() +for i in range(7): + indices = test_label == i + x, y = out[indices].T + plt.scatter(x, y, label=str(i)) +plt.legend() diff --git a/chapter7/data.py b/chapter7/data.py index 1e6005c..2034ada 100644 --- a/chapter7/data.py +++ b/chapter7/data.py @@ -13,11 +13,10 @@ class CoraData(object): - download_url = "https://github.com/kimiyoung/planetoid/raw/master/data" filenames = ["ind.cora.{}".format(name) for name in ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']] - def __init__(self, data_root="cora", rebuild=False): + def __init__(self, data_root="../data/cora", rebuild=False): """Cora数据,包括数据下载,处理,加载等功能 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘 @@ -32,19 +31,18 @@ def __init__(self, data_root="cora", rebuild=False): Args: ------- data_root: string, optional - 存放数据的目录,原始数据路径: {data_root}/raw - 缓存数据路径: {data_root}/processed_cora.pkl + 存放数据的目录,原始数据路径: ../data/cora + 缓存数据路径: {data_root}/ch7_cached.pkl rebuild: boolean, optional 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据 """ self.data_root = data_root - save_file = osp.join(self.data_root, "processed_cora.pkl") + save_file = osp.join(self.data_root, "ch7_cached.pkl") if osp.exists(save_file) and not rebuild: print("Using Cached file: {}".format(save_file)) self._data = pickle.load(open(save_file, "rb")) else: - self.maybe_download() self._data = self.process_data() with open(save_file, "wb") as f: pickle.dump(self.data, f) @@ -62,7 +60,7 @@ def process_data(self): """ print("Process data ...") _, tx, allx, y, ty, ally, graph, test_index = [self.read_data( - osp.join(self.data_root, "raw", name)) for name in self.filenames] + osp.join(self.data_root, name)) for name in self.filenames] train_index = np.arange(y.shape[0]) val_index = np.arange(y.shape[0], y.shape[0] + 500) sorted_test_index = sorted(test_index) @@ -91,13 +89,6 @@ def process_data(self): return Data(x=x, y=y, adjacency_dict=adjacency_dict, train_mask=train_mask, val_mask=val_mask, test_mask=test_mask) - def maybe_download(self): - save_path = os.path.join(self.data_root, "raw") - for name in self.filenames: - if not osp.exists(osp.join(save_path, name)): - self.download_data( - "{}/{}".format(self.download_url, name), save_path) - @staticmethod def build_adjacency(adj_dict): """根据邻接表创建邻接矩阵""" @@ -125,16 +116,3 @@ def read_data(path): out = pickle.load(open(path, "rb"), encoding="latin1") out = out.toarray() if hasattr(out, "toarray") else out return out - - @staticmethod - def download_data(url, save_path): - """数据下载工具,当原始数据不存在时将会进行下载""" - if not os.path.exists(save_path): - os.makedirs(save_path) - data = urllib.request.urlopen(url) - filename = os.path.split(url)[-1] - - with open(os.path.join(save_path, filename), 'wb') as f: - f.write(data.read()) - - return True diff --git a/chapter7/main.py b/chapter7/main.py index 76e07df..8818478 100644 --- a/chapter7/main.py +++ b/chapter7/main.py @@ -1,3 +1,4 @@ +#coding: utf-8 """ 基于Cora的GraphSage示例 """ diff --git "a/345円213円230円350円257円257円.pdf" "b/345円213円230円350円257円257円.pdf" index fc12089..da39fbe 100644 Binary files "a/345円213円230円350円257円257円.pdf" and "b/345円213円230円350円257円257円.pdf" differ

AltStyle によって変換されたページ (->オリジナル) /