Preface
Most of the code comes from:
- https://fate.readthedocs.io/en/latest/tutorial/pipeline/nn_tu...
- https://fate.readthedocs.io/en/latest/tutorial/pipeline/nn_tu...
However, the official documentation is incomplete, so I am writing up a complete record here myself.
I use the MNIST dataset: https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fate/examples/data/mnist.zip.
The directory structure is as follows. For the horizontal (homo) case, loading mnist twice is enough; for the vertical (hetero) case, one party loads mnist_guest (with labels) and the other loads mnist_host (without labels). Each dataset folder follows the torchvision ImageFolder layout, i.e. one subfolder per digit class containing the images. The two folders mnist1 and mnist2 are not used; ignore them.
Because the official demo requires Jupyter and is not well suited to ordinary Python scripts, this post gives the example as plain Python. When configuring your Python interpreter, make sure to add the interpreter path from bin/init_env.sh inside the FATE installation to your environment variables; otherwise the federatedml library will not be found.
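As a quick sanity check, the following should run without an ImportError once the interpreter is set correctly (a minimal sketch; the printed paths will of course depend on where FATE is installed):

# run this with the interpreter configured from FATE's bin/init_env.sh
import federatedml
import pipeline
print(federatedml.__file__)  # should point inside the FATE installation
print(pipeline.__file__)     # fate_client's pipeline package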
Horizontal (Homo)
Custom dataset
Define a custom dataset, then test it locally.
import os
from torchvision.datasets import ImageFolder
from torchvision import transforms
from federatedml.nn.dataset.base import Dataset


class MNISTDataset(Dataset):

    def __init__(self, flatten_feature=False):  # flatten feature or not
        super(MNISTDataset, self).__init__()
        self.image_folder = None
        self.ids = None
        self.flatten_feature = flatten_feature

    def load(self, path):  # read data from path, and set sample ids
        # read using ImageFolder
        self.image_folder = ImageFolder(root=path, transform=transforms.Compose([transforms.ToTensor()]))
        # filename as the image id
        ids = []
        for image_name in self.image_folder.imgs:
            ids.append(image_name[0].split('/')[-1].replace('.jpg', ''))
        self.ids = ids
        return self

    def get_sample_ids(self):
        # implement the get sample id interface, simply return ids
        return self.ids

    def __len__(self,):
        # return the length of the dataset
        return len(self.image_folder)

    def __getitem__(self, idx):  # get item
        ret = self.image_folder[idx]
        if self.flatten_feature:
            img = ret[0][0].flatten()  # return flatten tensor 784-dim
            return img, ret[1]  # return tensor and label
        else:
            return ret


ds = MNISTDataset(flatten_feature=True)
ds.load('mnist/')
# print(len(ds))
# print(ds[0])
# print(ds.get_sample_ids()[0])
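To see what the dataset returns, the commented-out checks at the end can be expanded like this (a small sketch; the actual sample ids depend on your file names):

img, label = ds[0]
print(len(ds))                   # number of images found under mnist/
print(img.shape)                 # torch.Size([784]), because flatten_feature=True
print(label)                     # integer class label assigned by ImageFolder
print(ds.get_sample_ids()[:5])   # file names (without .jpg) used as sample ids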
Once this runs and prints successfully, manually create a new dataset file under federatedml/nn/dataset in the FATE installation, and expand the code above into the form of a component class, as follows:
import torch
from federatedml.nn.dataset.base import Dataset
from torchvision.datasets import ImageFolder
from torchvision import transforms
import numpy as np
# add whatever imports are still missing here


class MNISTDataset(Dataset):

    def __init__(self, flatten_feature=False):  # flatten feature or not
        super(MNISTDataset, self).__init__()
        self.image_folder = None
        self.ids = None
        self.flatten_feature = flatten_feature

    def load(self, path):  # read data from path, and set sample ids
        # read using ImageFolder
        self.image_folder = ImageFolder(root=path, transform=transforms.Compose([transforms.ToTensor()]))
        # filename as the image id
        ids = []
        for image_name in self.image_folder.imgs:
            ids.append(image_name[0].split('/')[-1].replace('.jpg', ''))
        self.ids = ids
        return self

    def get_sample_ids(self):
        # implement the get sample id interface, simply return ids
        return self.ids

    def __len__(self,):
        # return the length of the dataset
        return len(self.image_folder)

    def __getitem__(self, idx):  # get item
        ret = self.image_folder[idx]
        if self.flatten_feature:
            img = ret[0][0].flatten()  # return flatten tensor 784-dim
            return img, ret[1]  # return tensor and label
        else:
            return ret


if __name__ == '__main__':
    pass
This completes what the official documentation calls "adding manually". After the file is added, it sits alongside the other dataset modules under federatedml/nn/dataset; its file name must match the dataset_param used below.
Once it is added, FATE "recognizes" our custom dataset.
The local test below does not require this manual-addition step, but local mode is only for testing and is of little use in production.
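Concretely, the mapping is just module file name → dataset_name. A minimal sketch, assuming the class above was saved as federatedml/nn/dataset/mnist_dataset.py (the name used in the training script below):

from pipeline.component.nn import DatasetParam

# dataset_name must equal the module file name (without .py);
# the remaining keyword arguments are forwarded to MNISTDataset.__init__
dataset_param = DatasetParam(dataset_name='mnist_dataset', flatten_feature=True)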
Horizontal training
import os
from torchvision.datasets import ImageFolder
from torchvision import transforms
from federatedml.nn.dataset.base import Dataset

# test local process
# from federatedml.nn.homo.trainer.fedavg_trainer import FedAVGTrainer
# trainer = FedAVGTrainer(epochs=3, batch_size=256, shuffle=True, data_loader_worker=8, pin_memory=False)  # set parameter
# trainer.local_mode()
# import torch as t
# from pipeline import fate_torch_hook
# fate_torch_hook(t)
# # our simple classification model:
# model = t.nn.Sequential(
#     t.nn.Linear(784, 32),
#     t.nn.ReLU(),
#     t.nn.Linear(32, 10),
#     t.nn.Softmax(dim=1)
# )
# trainer.set_model(model)  # set model
# optimizer = t.optim.Adam(model.parameters(), lr=0.01)  # optimizer
# loss = t.nn.CrossEntropyLoss()  # loss function
# trainer.train(train_set=ds, optimizer=optimizer, loss=loss)  # use dataset we just developed

# the new dataset must be added manually under the federatedml/nn/dataset directory!
# https://blog.csdn.net/Yonggie/article/details/129404212

# real training
import torch as t
from torch import nn
from pipeline import fate_torch_hook
from pipeline.component import HomoNN
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader, Evaluation, DataTransform
from pipeline.interface import Data, Model

t = fate_torch_hook(t)

# bind data path to name & namespace
fate_project_path = os.path.abspath('./')
host = 1
guest = 2
arbiter = 3
pipeline = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host, arbiter=arbiter)

data_0 = {"name": "mnist_guest", "namespace": "experiment"}
data_1 = {"name": "mnist_host", "namespace": "experiment"}

data_path_0 = fate_project_path + '/mnist'
data_path_1 = fate_project_path + '/mnist'
pipeline.bind_table(name=data_0['name'], namespace=data_0['namespace'], path=data_path_0)
pipeline.bind_table(name=data_1['name'], namespace=data_1['namespace'], path=data_path_1)

reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=data_0)
reader_0.get_party_instance(role='host', party_id=host).component_param(table=data_1)

from pipeline.component.nn import DatasetParam

# specify dataset, and its init parameters
dataset_param = DatasetParam(dataset_name='mnist_dataset', flatten_feature=True)

from pipeline.component.homo_nn import TrainerParam  # Interface

# our simple classification model:
model = t.nn.Sequential(
    t.nn.Linear(784, 32),
    t.nn.ReLU(),
    t.nn.Linear(32, 10),
    t.nn.Softmax(dim=1)
)

nn_component = HomoNN(name='nn_0',
                      model=model,  # model
                      loss=t.nn.CrossEntropyLoss(),  # loss
                      optimizer=t.optim.Adam(model.parameters(), lr=0.01),  # optimizer
                      dataset=dataset_param,  # dataset
                      trainer=TrainerParam(trainer_name='fedavg_trainer', epochs=2,
                                           batch_size=1024, validation_freqs=1),
                      torch_seed=100  # random seed
                      )

pipeline.add_component(reader_0)
pipeline.add_component(nn_component, data=Data(train_data=reader_0.output.data))
pipeline.add_component(Evaluation(name='eval_0', eval_type='multi'), data=Data(data=nn_component.output.data))

pipeline.compile()
pipeline.fit()

# print result and summary
pipeline.get_component('nn_0').get_output_data()
pipeline.get_component('nn_0').get_summary()
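After fit() returns, the two calls at the end can be wrapped in print() to actually see the results (a sketch; the exact return types of these pipeline calls vary between FATE versions):

print(pipeline.get_component('nn_0').get_summary())      # per-epoch loss and validation metrics
print(pipeline.get_component('nn_0').get_output_data())  # per-sample predictions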
Vertical (Hetero)
This uses mnist_host and mnist_guest; download them:
guest data: https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fa...
host data: https://webank-ai-1251170195.cos.ap-guangzhou.myqcloud.com/fa...
Take a look at the format of the datasets. In FATE, the vertical (hetero) setting always has one party holding the labels and the other party holding only features, which is different from the "merging two feature sets" scenario I had in mind.
Vertical dataset
The procedure follows the horizontal case; here I only give the code of the new class, which differs slightly from the horizontal version.
import torch
from federatedml.nn.dataset.base import Dataset
from torchvision.datasets import ImageFolder
from torchvision import transforms
import numpy as np


class MNISTDataset(Dataset):

    def __init__(self, return_label=True):
        super(MNISTDataset, self).__init__()
        self.return_label = return_label
        self.image_folder = None
        self.ids = None

    def load(self, path):
        self.image_folder = ImageFolder(root=path, transform=transforms.Compose([transforms.ToTensor()]))
        ids = []
        for image_name in self.image_folder.imgs:
            ids.append(image_name[0].split('/')[-1].replace('.jpg', ''))
        self.ids = ids
        return self

    def get_sample_ids(self, ):
        return self.ids

    def get_classes(self, ):
        return np.unique(self.image_folder.targets).tolist()

    def __len__(self,):
        return len(self.image_folder)

    def __getitem__(self, idx):  # get item
        ret = self.image_folder[idx]
        img = ret[0][0].flatten()  # flatten tensor 784 dims
        if self.return_label:
            return img, ret[1]  # img & label
        else:
            return img  # no label, for host


if __name__ == '__main__':
    pass
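As in the horizontal case, the file name decides the dataset_name. A minimal sketch, assuming this class is saved as federatedml/nn/dataset/mnist_hetero.py (the name used in the training script below):

from pipeline.component.hetero_nn import DatasetParam

# guest loads mnist_guest and keeps the labels; host loads mnist_host without labels
guest_dataset_param = DatasetParam(dataset_name='mnist_hetero', return_label=True)
host_dataset_param = DatasetParam(dataset_name='mnist_hetero', return_label=False)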
Vertical training
The detailed comments are all in the code.
import numpy as np
from federatedml.nn.dataset.base import Dataset
from torchvision.datasets import ImageFolder
from torchvision import transforms

# locally defined version (for local testing)
# class MNISTHetero(Dataset):
#
#     def __init__(self, return_label=True):
#         super(MNISTHetero, self).__init__()
#         self.return_label = return_label
#         self.image_folder = None
#         self.ids = None
#
#     def load(self, path):
#         self.image_folder = ImageFolder(root=path, transform=transforms.Compose([transforms.ToTensor()]))
#         ids = []
#         for image_name in self.image_folder.imgs:
#             ids.append(image_name[0].split('/')[-1].replace('.jpg', ''))
#         self.ids = ids
#         return self
#
#     def get_sample_ids(self, ):
#         return self.ids
#
#     def get_classes(self, ):
#         return np.unique(self.image_folder.targets).tolist()
#
#     def __len__(self,):
#         return len(self.image_folder)
#
#     def __getitem__(self, idx):  # get item
#         ret = self.image_folder[idx]
#         img = ret[0][0].flatten()  # flatten tensor 784 dims
#         if self.return_label:
#             return img, ret[1]  # img & label
#         else:
#             return img  # no label, for host

# test guest dataset
# ds = MNISTHetero().load('mnist_guest/')
# print(len(ds))
# print(ds[0][0])
# print(ds.get_classes())
# print(ds.get_sample_ids()[0: 10])

# test host dataset
# ds = MNISTHetero(return_label=False).load('mnist_host')
# print(len(ds))
# print(ds[0])  # no label

import os
import torch as t
from torch import nn
from pipeline import fate_torch_hook
from pipeline.component import HeteroNN
from pipeline.component.hetero_nn import DatasetParam
from pipeline.backend.pipeline import PipeLine
from pipeline.component import Reader, Evaluation, DataTransform
from pipeline.interface import Data, Model

fate_torch_hook(t)

# bind path to fate name & namespace
fate_project_path = os.path.abspath('./')
guest = 4
host = 3

pipeline_img = PipeLine().set_initiator(role='guest', party_id=guest).set_roles(guest=guest, host=host)

guest_data = {"name": "mnist_guest", "namespace": "experiment"}
host_data = {"name": "mnist_host", "namespace": "experiment"}

guest_data_path = fate_project_path + '/mnist_guest'
host_data_path = fate_project_path + '/mnist_host'
pipeline_img.bind_table(name='mnist_guest', namespace='experiment', path=guest_data_path)
pipeline_img.bind_table(name='mnist_host', namespace='experiment', path=host_data_path)

reader_0 = Reader(name="reader_0")
reader_0.get_party_instance(role='guest', party_id=guest).component_param(table=guest_data)
reader_0.get_party_instance(role='host', party_id=host).component_param(table=host_data)

# for why the component is defined this way, see the docs (they include a model diagram):
# https://fate.readthedocs.io/en/latest/federatedml_component/hetero_nn/
hetero_nn_0 = HeteroNN(name="hetero_nn_0", epochs=3,
                       interactive_layer_lr=0.01, batch_size=512,
                       task_type='classification', seed=100)
guest_nn_0 = hetero_nn_0.get_party_instance(role='guest', party_id=guest)
host_nn_0 = hetero_nn_0.get_party_instance(role='host', party_id=host)

# define model
# image features 784, guest bottom model
guest_bottom = t.nn.Sequential(
    t.nn.Linear(784, 8),
    t.nn.ReLU()
)
# image features 784, host bottom model
host_bottom = t.nn.Sequential(
    t.nn.Linear(784, 8),
    t.nn.ReLU()
)

# Top Model, a classifier
guest_top = t.nn.Sequential(
    nn.Linear(8, 10),
    nn.Softmax(dim=1)
)

# interactive layer define
interactive_layer = t.nn.InteractiveLayer(out_dim=8, guest_dim=8, host_dim=8)

# add models; per the docs, guest adds two models (top + bottom), host adds only the bottom one
guest_nn_0.add_top_model(guest_top)
guest_nn_0.add_bottom_model(guest_bottom)
host_nn_0.add_bottom_model(host_bottom)

# opt, loss
optimizer = t.optim.Adam(lr=0.01)
loss = t.nn.CrossEntropyLoss()

# use DatasetParam to specify dataset and pass parameters
# note: dataset_name must match the file name you added manually under federatedml/nn/dataset
guest_nn_0.add_dataset(DatasetParam(dataset_name='mnist_hetero', return_label=True))
host_nn_0.add_dataset(DatasetParam(dataset_name='mnist_hetero', return_label=False))

hetero_nn_0.set_interactive_layer(interactive_layer)
hetero_nn_0.compile(optimizer=optimizer, loss=loss)

pipeline_img.add_component(reader_0)
pipeline_img.add_component(hetero_nn_0, data=Data(train_data=reader_0.output.data))
pipeline_img.add_component(Evaluation(name='eval_0', eval_type='multi'), data=Data(data=hetero_nn_0.output.data))
pipeline_img.compile()
pipeline_img.fit()

# get the result
pipeline_img.get_component('hetero_nn_0').get_output_data()
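To actually display the metrics, the component summaries can be printed as well (a small sketch; get_summary() is the same call used in the horizontal example, and the evaluation results can also be viewed in FATEBoard):

print(pipeline_img.get_component('hetero_nn_0').get_summary())  # training loss per epoch
print(pipeline_img.get_component('eval_0').get_summary())       # multi-class evaluation metrics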