diff --git a/README.md b/README.md
index 9d867a0..c4ad6f1 100644
--- a/README.md
+++ b/README.md
@@ -30,4 +30,5 @@ torch>=1.2.0
1. Cora数据集无法下载
Cora数据集地址是:[kimiyoung/planetoid](https://github.com/kimiyoung/planetoid/tree/master/data)。
-仓库中提供了一份使用到的cora数据,可以分别将它放在 `chapter5/cora/raw` 或者 `chapter7/cora/raw` 目录下。
+~~仓库中提供了一份使用到的cora数据,可以分别将它放在 `chapter5/cora/raw` 或者 `chapter7/cora/raw` 目录下。~~
+新代码直接读取本地数据(默认路径为仓库根目录下的 `data/cora`),不再在线下载。
diff --git a/chapter5/GCN_Cora.ipynb b/chapter5/GCN_Cora.ipynb
index eaa1245..2a79164 100644
--- a/chapter5/GCN_Cora.ipynb
+++ b/chapter5/GCN_Cora.ipynb
@@ -96,11 +96,10 @@
"\n",
"\n",
"class CoraData(object):\n",
- " download_url = \"https://github.com/kimiyoung/planetoid/raw/master/data\"\n",
" filenames = [\"ind.cora.{}\".format(name) for name in\n",
" ['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']]\n",
"\n",
- " def __init__(self, data_root=\"cora\", rebuild=False):\n",
+ " def __init__(self, data_root=\"../data/cora\", rebuild=False):\n",
" \"\"\"Cora数据,包括数据下载,处理,加载等功能\n",
" 当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘\n",
"\n",
@@ -115,19 +114,18 @@
" Args:\n",
" -------\n",
" data_root: string, optional\n",
- " 存放数据的目录,原始数据路径: {data_root}/raw\n",
- " 缓存数据路径: {data_root}/processed_cora.pkl\n",
+ " 存放数据的目录,原始数据路径: {data_root}(默认为 ../data/cora)\n",
+ " 缓存数据路径: {data_root}/ch5_cached.pkl\n",
" rebuild: boolean, optional\n",
" 是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据\n",
"\n",
" \"\"\"\n",
" self.data_root = data_root\n",
- " save_file = osp.join(self.data_root, \"processed_cora.pkl\")\n",
+ " save_file = osp.join(self.data_root, \"ch5_cached.pkl\")\n",
" if osp.exists(save_file) and not rebuild:\n",
" print(\"Using Cached file: {}\".format(save_file))\n",
" self._data = pickle.load(open(save_file, \"rb\"))\n",
" else:\n",
- " self.maybe_download()\n",
" self._data = self.process_data()\n",
" with open(save_file, \"wb\") as f:\n",
" pickle.dump(self.data, f)\n",
@@ -145,7 +143,7 @@
" \"\"\"\n",
" print(\"Process data ...\")\n",
" _, tx, allx, y, ty, ally, graph, test_index = [self.read_data(\n",
- " osp.join(self.data_root, \"raw\", name)) for name in self.filenames]\n",
+ " osp.join(self.data_root, name)) for name in self.filenames]\n",
" train_index = np.arange(y.shape[0])\n",
" val_index = np.arange(y.shape[0], y.shape[0] + 500)\n",
" sorted_test_index = sorted(test_index)\n",
@@ -174,13 +172,6 @@
" return Data(x=x, y=y, adjacency=adjacency,\n",
" train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)\n",
"\n",
- " def maybe_download(self):\n",
- " save_path = os.path.join(self.data_root, \"raw\")\n",
- " for name in self.filenames:\n",
- " if not osp.exists(osp.join(save_path, name)):\n",
- " self.download_data(\n",
- " \"{}/{}\".format(self.download_url, name), save_path)\n",
- "\n",
" @staticmethod\n",
" def build_adjacency(adj_dict):\n",
" \"\"\"根据邻接表创建邻接矩阵\"\"\"\n",
@@ -210,19 +201,6 @@
" return out\n",
"\n",
" @staticmethod\n",
- " def download_data(url, save_path):\n",
- " \"\"\"数据下载工具,当原始数据不存在时将会进行下载\"\"\"\n",
- " if not os.path.exists(save_path):\n",
- " os.makedirs(save_path)\n",
- " data = urllib.request.urlopen(url)\n",
- " filename = os.path.split(url)[-1]\n",
- "\n",
- " with open(os.path.join(save_path, filename), 'wb') as f:\n",
- " f.write(data.read())\n",
- "\n",
- " return True\n",
- "\n",
- " @staticmethod\n",
" def normalization(adjacency):\n",
" \"\"\"计算 L=D^-0.5 * (A+I) * D^-0.5\"\"\"\n",
" adjacency += sp.eye(adjacency.shape[0]) # 增加自连接\n",
diff --git a/chapter5/GCN_Cora.py b/chapter5/GCN_Cora.py
index 9377f8f..53c888c 100644
--- a/chapter5/GCN_Cora.py
+++ b/chapter5/GCN_Cora.py
@@ -2,10 +2,20 @@
# coding: utf-8
#
Table of Contents
-#
+#
# # 基于Cora数据集的GCN节点分类
+#
+#
+# Run in Google Colab
+#
+#
+
+# 在Colab中运行时可以通过`代码执行程序->更改运行时类型`选择使用`GPU`
+
+# ## SetUp
+
# In[1]:
@@ -35,12 +45,15 @@
'train_mask', 'val_mask', 'test_mask'])
+def tensor_from_numpy(x, device):
+ return torch.from_numpy(x).to(device)
+
+
class CoraData(object):
- download_url = "https://github.com/kimiyoung/planetoid/raw/master/data"
filenames = ["ind.cora.{}".format(name) for name in
['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']]
- def __init__(self, data_root="cora", rebuild=False):
+ def __init__(self, data_root="../data/cora", rebuild=False):
"""Cora数据,包括数据下载,处理,加载等功能
当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘
@@ -55,19 +68,18 @@ def __init__(self, data_root="cora", rebuild=False):
Args:
-------
data_root: string, optional
- 存放数据的目录,原始数据路径: {data_root}/raw
- 缓存数据路径: {data_root}/processed_cora.pkl
+ 存放数据的目录,原始数据路径: {data_root}(默认为 ../data/cora)
+ 缓存数据路径: {data_root}/ch5_cached.pkl
rebuild: boolean, optional
是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据
"""
self.data_root = data_root
- save_file = osp.join(self.data_root, "processed_cora.pkl")
+ save_file = osp.join(self.data_root, "ch5_cached.pkl")
if osp.exists(save_file) and not rebuild:
print("Using Cached file: {}".format(save_file))
self._data = pickle.load(open(save_file, "rb"))
else:
- self.maybe_download()
self._data = self.process_data()
with open(save_file, "wb") as f:
pickle.dump(self.data, f)
@@ -85,7 +97,7 @@ def process_data(self):
"""
print("Process data ...")
_, tx, allx, y, ty, ally, graph, test_index = [self.read_data(
- osp.join(self.data_root, "raw", name)) for name in self.filenames]
+ osp.join(self.data_root, name)) for name in self.filenames]
train_index = np.arange(y.shape[0])
val_index = np.arange(y.shape[0], y.shape[0] + 500)
sorted_test_index = sorted(test_index)
@@ -114,13 +126,6 @@ def process_data(self):
return Data(x=x, y=y, adjacency=adjacency,
train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
- def maybe_download(self):
- save_path = os.path.join(self.data_root, "raw")
- for name in self.filenames:
- if not osp.exists(osp.join(save_path, name)):
- self.download_data(
- "{}/{}".format(self.download_url, name), save_path)
-
@staticmethod
def build_adjacency(adj_dict):
"""根据邻接表创建邻接矩阵"""
@@ -149,19 +154,6 @@ def read_data(path):
out = out.toarray() if hasattr(out, "toarray") else out
return out
- @staticmethod
- def download_data(url, save_path):
- """数据下载工具,当原始数据不存在时将会进行下载"""
- if not os.path.exists(save_path):
- os.makedirs(save_path)
- data = urllib.request.urlopen(url)
- filename = os.path.split(url)[-1]
-
- with open(os.path.join(save_path, filename), 'wb') as f:
- f.write(data.read())
-
- return True
-
@staticmethod
def normalization(adjacency):
"""计算 L=D^-0.5 * (A+I) * D^-0.5"""
@@ -222,10 +214,12 @@ def forward(self, adjacency, input_feature):
return output
def __repr__(self):
- return self.__class__.__name__ + ' (' + str(self.in_features) + ' -> ' + str(self.out_features) + ')'
+ return self.__class__.__name__ + ' (' + str(self.input_dim) + ' -> ' + str(self.output_dim) + ')'
# ## 模型定义
+#
+# 读者可以自己对GCN模型结构进行修改和实验
# In[4]:
@@ -251,19 +245,10 @@ def forward(self, adjacency, feature):
# 超参数定义
-learning_rate = 0.1
-weight_decay = 5e-4
-epochs = 200
-
-
-# In[6]:
-
-
-# 模型定义:Model, Loss, Optimizer
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model = GcnNet().to(device)
-criterion = nn.CrossEntropyLoss().to(device)
-optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
+LEARNING_RATE = 0.1
+WEIGHT_DACAY = 5e-4
+EPOCHS = 200
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# In[7]:
@@ -271,18 +256,31 @@ def forward(self, adjacency, feature):
# 加载数据,并转换为torch.Tensor
dataset = CoraData().data
-x = dataset.x / dataset.x.sum(1, keepdims=True) # 归一化数据,使得每一行和为1
-tensor_x = torch.from_numpy(x).to(device)
-tensor_y = torch.from_numpy(dataset.y).to(device)
-tensor_train_mask = torch.from_numpy(dataset.train_mask).to(device)
-tensor_val_mask = torch.from_numpy(dataset.val_mask).to(device)
-tensor_test_mask = torch.from_numpy(dataset.test_mask).to(device)
+node_feature = dataset.x / dataset.x.sum(1, keepdims=True) # 归一化数据,使得每一行和为1
+tensor_x = tensor_from_numpy(node_feature, DEVICE)
+tensor_y = tensor_from_numpy(dataset.y, DEVICE)
+tensor_train_mask = tensor_from_numpy(dataset.train_mask, DEVICE)
+tensor_val_mask = tensor_from_numpy(dataset.val_mask, DEVICE)
+tensor_test_mask = tensor_from_numpy(dataset.test_mask, DEVICE)
normalize_adjacency = CoraData.normalization(dataset.adjacency) # 规范化邻接矩阵
+
+num_nodes, input_dim = node_feature.shape
indices = torch.from_numpy(np.asarray([normalize_adjacency.row,
normalize_adjacency.col]).astype('int64')).long()
values = torch.from_numpy(normalize_adjacency.data.astype(np.float32))
tensor_adjacency = torch.sparse.FloatTensor(indices, values,
- (2708, 2708)).to(device)
+ (num_nodes, num_nodes)).to(DEVICE)
+
+
+# In[ ]:
+
+
+# 模型定义:Model, Loss, Optimizer
+model = GcnNet(input_dim).to(DEVICE)
+criterion = nn.CrossEntropyLoss().to(DEVICE)
+optimizer = optim.Adam(model.parameters(),
+ lr=LEARNING_RATE,
+ weight_decay=WEIGHT_DACAY)
# In[8]:
@@ -294,7 +292,7 @@ def train():
val_acc_history = []
model.train()
train_y = tensor_y[tensor_train_mask]
- for epoch in range(epochs):
+ for epoch in range(EPOCHS):
logits = model(tensor_adjacency, tensor_x) # 前向传播
train_mask_logits = logits[tensor_train_mask] # 只选择训练节点进行监督
loss = criterion(train_mask_logits, train_y) # 计算损失值
@@ -365,5 +363,14 @@ def plot_loss_with_acc(loss_history, val_acc_history):
# In[ ]:
-
+# 绘制测试数据的TSNE降维图
+from sklearn.manifold import TSNE
+tsne = TSNE()
+out = tsne.fit_transform(test_logits)
+fig = plt.figure()
+for i in range(7):
+ indices = test_label == i
+ x, y = out[indices].T
+ plt.scatter(x, y, label=str(i))
+plt.legend()
diff --git a/chapter7/data.py b/chapter7/data.py
index 1e6005c..2034ada 100644
--- a/chapter7/data.py
+++ b/chapter7/data.py
@@ -13,11 +13,10 @@
class CoraData(object):
- download_url = "https://github.com/kimiyoung/planetoid/raw/master/data"
filenames = ["ind.cora.{}".format(name) for name in
['x', 'tx', 'allx', 'y', 'ty', 'ally', 'graph', 'test.index']]
- def __init__(self, data_root="cora", rebuild=False):
+ def __init__(self, data_root="../data/cora", rebuild=False):
"""Cora数据,包括数据下载,处理,加载等功能
当数据的缓存文件存在时,将使用缓存文件,否则将下载、进行处理,并缓存到磁盘
@@ -32,19 +31,18 @@ def __init__(self, data_root="cora", rebuild=False):
Args:
-------
data_root: string, optional
- 存放数据的目录,原始数据路径: {data_root}/raw
- 缓存数据路径: {data_root}/processed_cora.pkl
+ 存放数据的目录,原始数据路径: {data_root}(默认为 ../data/cora)
+ 缓存数据路径: {data_root}/ch7_cached.pkl
rebuild: boolean, optional
是否需要重新构建数据集,当设为True时,如果存在缓存数据也会重建数据
"""
self.data_root = data_root
- save_file = osp.join(self.data_root, "processed_cora.pkl")
+ save_file = osp.join(self.data_root, "ch7_cached.pkl")
if osp.exists(save_file) and not rebuild:
print("Using Cached file: {}".format(save_file))
self._data = pickle.load(open(save_file, "rb"))
else:
- self.maybe_download()
self._data = self.process_data()
with open(save_file, "wb") as f:
pickle.dump(self.data, f)
@@ -62,7 +60,7 @@ def process_data(self):
"""
print("Process data ...")
_, tx, allx, y, ty, ally, graph, test_index = [self.read_data(
- osp.join(self.data_root, "raw", name)) for name in self.filenames]
+ osp.join(self.data_root, name)) for name in self.filenames]
train_index = np.arange(y.shape[0])
val_index = np.arange(y.shape[0], y.shape[0] + 500)
sorted_test_index = sorted(test_index)
@@ -91,13 +89,6 @@ def process_data(self):
return Data(x=x, y=y, adjacency_dict=adjacency_dict,
train_mask=train_mask, val_mask=val_mask, test_mask=test_mask)
- def maybe_download(self):
- save_path = os.path.join(self.data_root, "raw")
- for name in self.filenames:
- if not osp.exists(osp.join(save_path, name)):
- self.download_data(
- "{}/{}".format(self.download_url, name), save_path)
-
@staticmethod
def build_adjacency(adj_dict):
"""根据邻接表创建邻接矩阵"""
@@ -125,16 +116,3 @@ def read_data(path):
out = pickle.load(open(path, "rb"), encoding="latin1")
out = out.toarray() if hasattr(out, "toarray") else out
return out
-
- @staticmethod
- def download_data(url, save_path):
- """数据下载工具,当原始数据不存在时将会进行下载"""
- if not os.path.exists(save_path):
- os.makedirs(save_path)
- data = urllib.request.urlopen(url)
- filename = os.path.split(url)[-1]
-
- with open(os.path.join(save_path, filename), 'wb') as f:
- f.write(data.read())
-
- return True
diff --git a/chapter7/main.py b/chapter7/main.py
index 76e07df..8818478 100644
--- a/chapter7/main.py
+++ b/chapter7/main.py
@@ -1,3 +1,4 @@
+#coding: utf-8
"""
基于Cora的GraphSage示例
"""
diff --git "a/345円213円230円350円257円257円.pdf" "b/345円213円230円350円257円257円.pdf"
index fc12089..da39fbe 100644
Binary files "a/345円213円230円350円257円257円.pdf" and "b/345円213円230円350円257円257円.pdf" differ