diff --git a/subject2-pre/.keep b/subject2-pre/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/subject2-pre/Figs/.keep b/subject2-pre/Figs/.keep
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/subject2-pre/Figs/cpu.png b/subject2-pre/Figs/cpu.png
new file mode 100644
index 0000000000000000000000000000000000000000..b8dcb9f6576e33103eca4eb7e75b05d387d174de
Binary files /dev/null and b/subject2-pre/Figs/cpu.png differ
diff --git a/subject2-pre/Figs/framework.png b/subject2-pre/Figs/framework.png
new file mode 100644
index 0000000000000000000000000000000000000000..e4195d861c5474608eec34808519b4da09e6bbf5
Binary files /dev/null and b/subject2-pre/Figs/framework.png differ
diff --git a/subject2-pre/Figs/mem.png b/subject2-pre/Figs/mem.png
new file mode 100644
index 0000000000000000000000000000000000000000..3d4272efc1f3b2ea8db5c0d11579cd037f1429b5
Binary files /dev/null and b/subject2-pre/Figs/mem.png differ
diff --git a/subject2-pre/Figs/ourgan.png b/subject2-pre/Figs/ourgan.png
new file mode 100644
index 0000000000000000000000000000000000000000..d1507144287c349fbbfa28fac3c65e8c5b76663a
Binary files /dev/null and b/subject2-pre/Figs/ourgan.png differ
diff --git a/subject2-pre/README.md b/subject2-pre/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e6c61b96726b95689d224a51dd1a12d1c6d575de
--- /dev/null
+++ b/subject2-pre/README.md
@@ -0,0 +1,88 @@
+# JC-workload-predictor
+
+
+This repository provides the implementation of the paper "Joint Privacy-Preserving Cloud Workload Prediction Based on Federated Learning", which is published in XXX.
+In this paper, we propose a joint privacy-preserving cloud workload prediction framework based on Federated Learning (FL), which enables the collaborative training of a workload prediction model across cloud providers with heterogeneous workloads and model preferences without exposing private workload data.
+We first design a Generative Adversarial Network (GAN) based virtual workload dataset generation method that incorporates workloads' temporal and resource-usage-plan-related features via temporal-aware feature calibration.
+Then, we design an FL approach based on a hybrid local model composed of a homogeneous public model and a heterogeneous personal model.
+Finally, we design a Knowledge Distillation (KD) based approach to post-train the personal model with both private workload knowledge and shared workload knowledge learned from the trained global public model.
+Extensive experiments on various real-world workloads demonstrate that, compared with the state-of-the-art, our framework improves workload prediction accuracy by 45.5\% on average over all cloud providers.
+
+
+[Figures: MSEs of memory utilization prediction results (lower is better, Figs/mem.png) and MSEs of CPU utilization prediction results (lower is better, Figs/cpu.png)]
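+
+The evaluation step that produces these MSEs is not included in this excerpt; the federated scripts below only save the per-provider client models (e.g. `client_model_1.pth`). The snippet below is a hedged sketch of how such a test-set MSE could be computed from a saved model: the CSV path is a placeholder, while the `mem_use` column, the 4-step window, the 80/20 split, and the GRU architecture follow `fed_gan.py`.
+
+```python
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+from sklearn.preprocessing import MinMaxScaler
+
+
+class ClientGRU(nn.Module):
+    # Same architecture that the federated scripts train and save.
+    def __init__(self, feature_size=1, hidden_size=256, num_layers=2, output_size=1):
+        super().__init__()
+        self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True)
+        self.fc = nn.Linear(hidden_size, output_size)
+
+    def forward(self, x, hidden=None):
+        output, hidden = self.gru(x, hidden)
+        return self.fc(output[:, -1, :])
+
+
+def make_test_split(series, timestep=4, train_ratio=0.8):
+    # Sliding windows, as in split_data() of fed_gan.py; keep only the held-out tail.
+    x = np.array([series[i:i + timestep] for i in range(len(series) - timestep)])
+    y = np.array([series[i + timestep][0] for i in range(len(series) - timestep)]).reshape(-1, 1)
+    cut = int(np.round(train_ratio * len(x)))
+    return x[cut:], y[cut:]
+
+
+series = MinMaxScaler(feature_range=(0, 1)).fit_transform(
+    pd.read_csv("processed_alibaba.csv")[["mem_use"]].values)  # placeholder path
+x_test, y_test = make_test_split(series)
+
+model = ClientGRU()
+model.load_state_dict(torch.load("client_model_1.pth", map_location="cpu"))
+model.eval()
+with torch.no_grad():
+    pred = model(torch.tensor(x_test, dtype=torch.float32))
+    mse = nn.MSELoss()(pred, torch.tensor(y_test, dtype=torch.float32)).item()
+print("test MSE:", mse)
+```
+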
+
+Our framework consists of the following three key components:
+
+[Figure: overall framework (Figs/framework.png)]
+
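+Components 2 and 3 (the hybrid local model and the KD-based post-training of the personal model) live in the `fedavg`/`ourfl-nokd` and `ourfl_kd` folders referenced below. Purely as an illustration of the idea rather than the exact implementation, a hybrid local model could pair a homogeneous public GRU body (uploaded and FedAvg-averaged across providers) with a heterogeneous personal head that never leaves the client, and the post-training could distill the trained global public model into the personal part; all names and the `alpha` weight here are assumptions:
+
+```python
+import torch
+import torch.nn as nn
+
+
+class HybridLocalModel(nn.Module):
+    # Illustrative hybrid model: a public (shared) GRU body plus a personal
+    # (provider-specific) prediction head that stays on the client.
+    def __init__(self, feature_size=1, hidden_size=256, output_size=1):
+        super().__init__()
+        self.public = nn.GRU(feature_size, hidden_size, num_layers=2, batch_first=True)
+        self.personal = nn.Sequential(
+            nn.Linear(hidden_size, 64), nn.ReLU(), nn.Linear(64, output_size))
+
+    def forward(self, x):
+        h, _ = self.public(x)
+        return self.personal(h[:, -1, :])
+
+
+def kd_post_train_step(model, global_public_model, x, y, alpha=0.5):
+    # One illustrative post-training step: fit the private workload labels while
+    # distilling the predictions of the trained global public model (shared knowledge).
+    mse = nn.MSELoss()
+    with torch.no_grad():
+        shared_pred = global_public_model(x)
+    pred = model(x)
+    return alpha * mse(pred, y) + (1 - alpha) * mse(pred, shared_pred)
+```
+
+In such a setup only the `public` parameters would be exchanged during federated rounds, while the KD step runs locally after the global public model has converged.
+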
+
+
+## 1. Virtual Workload Dataset Generation
+
+
+[Figure: GAN-based virtual workload dataset generation (Figs/ourgan.png)]
+
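+The TVHL generators below differ from the plain GAN generators in one place: a temporal-aware joint-distribution term that is added to the discriminator loss with weight 0.15. The function is reproduced (with added comments) from `tvhl_alibaba.py` in this patch; it assumes the real and generated batches have the same number of rows:
+
+```python
+import numpy as np
+from scipy.spatial.distance import cdist
+
+
+def compute_joint_distribution_loss(real_samples, fake_samples):
+    # (batch, 7) arrays; column 0 is the normalized hour-of-day, the remaining
+    # columns are cpu/gpu/mem usage and resource-plan features.
+    real_relative_time = real_samples[:, 0]
+    fake_relative_time = fake_samples[:, 0]
+    real_features = real_samples[:, 1:]
+    fake_features = fake_samples[:, 1:]
+
+    # Pairwise Euclidean distance between real and generated feature rows, scaled by 0.1.
+    distance = 0.1 * cdist(real_features, fake_features, metric='euclidean')
+
+    # Hour-of-day gap between the i-th real and i-th generated row; weighting the
+    # feature distances by this gap makes the match time-conditioned
+    # (the temporal-aware feature calibration described in the paper).
+    conditional_distance = np.abs(real_relative_time - fake_relative_time)
+    return np.mean(distance * conditional_distance)
+```
+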
+ +The code in the folder [code_generate_data](https://gitee.com/jointcloud_computing/joint-cloud-computing/tree/subject2/subject2-pre/all-process/generate_data) is the code to generate virtual data, including GAN, VHL and our TVHL. +The input is the path of the dataset. + + + + +## 2. Federated Learning with Virtual Workload Datasets + + +FedAvg was used for training, using different datasets. + +Folder [code_fedavg](https://gitee.com/jointcloud_computing/joint-cloud-computing/tree/subject2/subject2-pre/all-process/fedavg) is using the actual dataset. + +Folder [code_ourfl-nokd](https://gitee.com/jointcloud_computing/joint-cloud-computing/tree/subject2/subject2-pre/all-process/ourfl-nokd) is the use of virtual generated dataset. + + + +## 3. Post Training of Personal Models. + + +The code in the folde [code_ourfl_kd](https://gitee.com/jointcloud_computing/joint-cloud-computing/tree/subject2/subject2-pre/all-process/ourfl_kd) shows the training of the individual model after the local public model is trained. + + + +## Prerequisites + +To run the code, it needs some libraies: + +- Python >= 3.7 +- Pytorch >= 2.0.1 +- torchvision >= 0.15.2 + +Our environment is shown in the file, named [environment.yaml](https://gitee.com/jointcloud_computing/joint-cloud-computing/edit/subject2/subject2-pre/environment.yaml). + +## Citing + + + +If you use this repository, please cite: +```bibtex +@article{, + title={}, + author={Bin yang and Yan, Li}, + journal={}, + volume={}, + year={}, + publisher={} +} +``` + + diff --git a/subject2-pre/all-process/.keep b/subject2-pre/all-process/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/subject2-pre/all-process/fedavg/.keep b/subject2-pre/all-process/fedavg/.keep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/subject2-pre/all-process/fedavg/fed_gan.py b/subject2-pre/all-process/fedavg/fed_gan.py new file mode 100644 index 0000000000000000000000000000000000000000..abfa9cb3789aa4ec951a5b7cf2ba41b6b2e4cf59 --- /dev/null +++ b/subject2-pre/all-process/fedavg/fed_gan.py @@ -0,0 +1,228 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +# 定义配置类 +class Config(): + timestep = 4 + batch_size = 32 + feature_size = 1 + hidden_size = 256 + output_size = 1 + num_layers = 2 + epochs = 4 + learning_rate = 0.0005 + model_name = 'gru' + save_path = './{}.pth'.format(model_name) + +config = Config() + +# 定义GRU模型 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + output, h_0 = self.gru(x, h_0) + + batch_size, timestep, hidden_size = output.shape + + output = output.reshape(-1, hidden_size) + + output = self.fc(output) + + output = output.reshape(batch_size, timestep, -1) + + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + 
dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.8 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + + +data1 = pd.read_csv(r'F:\ourfl\genedate\GAN\gan_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\genedate\GAN\gan_google.csv') +data3 = pd.read_csv(r'F:\ourfl\genedate\GAN\gan_mircosoft.csv') +data4 = pd.read_csv(r'F:\ourfl\genedate\GAN\gan_sugan1.csv') + +# 提取avg_mem列 +df1 = data1[['mem_use']] +df2 = data2[['mem_use']] +df3 = data3[['mem_use']] +df4 = data4[['mem_use']] +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 4 # 您可以根据实际情况调整时间步数 +feature_size = 1 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + + +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum 
/ len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... (其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 20 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"client_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/fedavg/fed_tvhl.py b/subject2-pre/all-process/fedavg/fed_tvhl.py new file mode 100644 index 0000000000000000000000000000000000000000..de7ca27ca2004fce462f2abe2173582152188634 --- /dev/null +++ b/subject2-pre/all-process/fedavg/fed_tvhl.py @@ -0,0 +1,225 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +# 定义配置类 +class Config(): + timestep = 4 + batch_size = 32 + feature_size = 1 + hidden_size = 256 + output_size = 1 + num_layers = 2 + epochs = 10 + learning_rate = 0.0001 + model_name = 'gru' + save_path = './{}.pth'.format(model_name) + +config = Config() + +# 定义GRU模型 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + output, h_0 = self.gru(x, h_0) + + batch_size, timestep, hidden_size = output.shape + + output = output.reshape(-1, hidden_size) + + output = self.fc(output) + + output = output.reshape(batch_size, timestep, -1) + + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + 
x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + +data1 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_google.csv') +data3 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_mircosoft.csv') +data4 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_sugan1.csv') +# 提取avg_mem列 +df1 = data1[['mem_use']] +df2 = data2[['mem_use']] +df3 = data3[['mem_use']] +df4 = data4[['mem_use']] +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 4 # 您可以根据实际情况调整时间步数 +feature_size = 1 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + + +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... 
(其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 10 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"tvhl_client_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/fedavg/fed_vhl.py b/subject2-pre/all-process/fedavg/fed_vhl.py new file mode 100644 index 0000000000000000000000000000000000000000..4b153f6215a5fae59686e6cbbbc837a387dc63e4 --- /dev/null +++ b/subject2-pre/all-process/fedavg/fed_vhl.py @@ -0,0 +1,227 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +# 定义配置类 +class Config(): + timestep = 1 + batch_size = 32 + feature_size = 1 + hidden_size = 256 + output_size = 1 + num_layers = 2 + epochs = 10 + learning_rate = 0.0001 + model_name = 'gru' + save_path = './{}.pth'.format(model_name) + +config = Config() + +# 定义GRU模型 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + output, h_0 = self.gru(x, h_0) + + batch_size, timestep, hidden_size = output.shape + + output = output.reshape(-1, hidden_size) + + output = self.fc(output) + + output = output.reshape(batch_size, timestep, -1) + + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, 
x_test, y_test] + + +data1 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_google.csv') +data3 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_mircosoft.csv') +data4 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_sugan1.csv') + +# 提取avg_mem列 +df1 = data1[['mem_use']] +df2 = data2[['mem_use']] +df3 = data3[['mem_use']] +df4 = data4[['mem_use']] +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 1 # 您可以根据实际情况调整时间步数 +feature_size = 1 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + + +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... 
(其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 10 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"vhl_client_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/fedavg/fedavg.py b/subject2-pre/all-process/fedavg/fedavg.py new file mode 100644 index 0000000000000000000000000000000000000000..e43a00bd32ce813deda78bbfaa6939493e486448 --- /dev/null +++ b/subject2-pre/all-process/fedavg/fedavg.py @@ -0,0 +1,224 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +# 定义配置类 +class Config(): + timestep = 1 + batch_size = 32 + feature_size = 1 + hidden_size = 256 + output_size = 1 + num_layers = 2 + epochs = 20 + learning_rate = 0.0001 + model_name = 'gru' + save_path = './{}.pth'.format(model_name) + +config = Config() + +# 定义GRU模型 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + output, h_0 = self.gru(x, h_0) + + batch_size, timestep, hidden_size = output.shape + + output = output.reshape(-1, hidden_size) + + output = self.fc(output) + + output = output.reshape(batch_size, timestep, -1) + + return output[:, -1, :] + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, 
y_test] + +data1 = pd.read_csv(r"F:\ourfl\data\processed_alibaba.csv") + +data2 = pd.read_csv(r"F:\ourfl\data\pre_google.csv") +data3 = pd.read_csv(r"F:\ourfl\data\pre_microsoft.csv") +data4 = pd.read_csv(r"F:\ourfl\data\processed_hefei.csv") +# 提取avg_mem列 +df1 = data1[['mem_use']] +df2 = data2[['mem_use']] +df3 = data3[['mem_use']] +df4 = data4[['mem_use']] +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 1 # 您可以根据实际情况调整时间步数 +feature_size = 1 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... 
(其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 30 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"client_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/generate_data/GAN/gan_alibaba.py b/subject2-pre/all-process/generate_data/GAN/gan_alibaba.py new file mode 100644 index 0000000000000000000000000000000000000000..f11342c7fe5e34dc8468ec4593a90fc04ff7e2a2 --- /dev/null +++ b/subject2-pre/all-process/generate_data/GAN/gan_alibaba.py @@ -0,0 +1,119 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError + + +data = pd.read_csv(r'F:\ourfl\data\processed_alibaba.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] +data['time'] = data['time'] % 24 +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + + +def train_gan(noise_data, real_data, epochs=3000, batch_size=32): + for epoch 
in range(epochs): + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 打印损失和精度 + if epoch % 100 == 0: + print(f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(noise, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() + +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\GAN\gan_alibaba.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/GAN/gan_google.py b/subject2-pre/all-process/generate_data/GAN/gan_google.py new file mode 100644 index 0000000000000000000000000000000000000000..fbf52078f1f101496f93f63e4420bc726915c0d0 --- /dev/null +++ b/subject2-pre/all-process/generate_data/GAN/gan_google.py @@ -0,0 +1,118 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError + + +data = pd.read_csv(r'F:\ourfl\data\pre_google.csv') +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] +data['time'] = data['time'] % 24 +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model 
= Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + + +def train_gan(noise_data, real_data, epochs=3000, batch_size=32): + for epoch in range(epochs): + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 打印损失和精度 + if epoch % 100 == 0: + print(f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(noise, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() + +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\GAN\gan_google.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/GAN/gan_hpc.py b/subject2-pre/all-process/generate_data/GAN/gan_hpc.py new file mode 100644 index 0000000000000000000000000000000000000000..8cf3b33f7fc5a9dd5311d7ea186aec9a8b9dbbfd --- /dev/null +++ b/subject2-pre/all-process/generate_data/GAN/gan_hpc.py @@ -0,0 +1,119 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError + + +data = pd.read_csv(r'F:\ourfl\data\processed_hefei.csv') + 
+data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] +data['time'] = data['time'] % 24 +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + + +def train_gan(noise_data, real_data, epochs=3000, batch_size=32): + for epoch in range(epochs): + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 打印损失和精度 + if epoch % 100 == 0: + print(f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(noise, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() + +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\GAN\gan_sugan1.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/GAN/gan_mircosoft.py 
b/subject2-pre/all-process/generate_data/GAN/gan_mircosoft.py new file mode 100644 index 0000000000000000000000000000000000000000..a771474f68b34e4b131060eb3f131747d1cb2c60 --- /dev/null +++ b/subject2-pre/all-process/generate_data/GAN/gan_mircosoft.py @@ -0,0 +1,119 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError + + +data = pd.read_csv(r'F:\ourfl\data\pre_microsoft.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] +data['time'] = data['time'] % 24 +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + + +def train_gan(noise_data, real_data, epochs=3000, batch_size=32): + for epoch in range(epochs): + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 打印损失和精度 + if epoch % 100 == 0: + print(f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(noise, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = 
data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() + +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\GAN\gan_mircosoft.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/TVHL/tvhl_alibaba.py b/subject2-pre/all-process/generate_data/TVHL/tvhl_alibaba.py new file mode 100644 index 0000000000000000000000000000000000000000..b5687054ff2badbf49b97c1b10b681c827e3ca4c --- /dev/null +++ b/subject2-pre/all-process/generate_data/TVHL/tvhl_alibaba.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = pd.read_csv(r'F:\ourfl\data\processed_alibaba.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # Extract the 'relative_time' column + real_relative_time = real_samples[:, 0] + fake_relative_time = fake_samples[:, 0] + + # Remove the 'relative_time' column from the samples + real_samples = real_samples[:, 1:] + fake_samples = fake_samples[:, 1:] + + # Calculate the Euclidean distance between samples without the 'relative_time' feature + distance =0.1* cdist(real_samples, fake_samples, metric='euclidean') + + # Compute the joint distribution loss with 'relative_time' as a condition + conditional_distance = np.abs(real_relative_time - fake_relative_time) + joint_distribution_loss = np.mean(distance * conditional_distance) + + return joint_distribution_loss + +#训练模型 +def train_gan(noise_data, real_data, epochs, batch_size): + for epoch in range(epochs): + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + 
generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.15 * joint_distribution_loss + # 打印损失和精度 + if epoch % 100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +#设置高斯噪声(全局统一) +noise = np.random.normal(0, 1, size=(10000, 7)) +#训练模型 +train_gan(data, noise, epochs=3000, batch_size=32) +# 数据生成 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\TVHL\tvhl_alibaba.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/TVHL/tvhl_google.py b/subject2-pre/all-process/generate_data/TVHL/tvhl_google.py new file mode 100644 index 0000000000000000000000000000000000000000..c12d4e8608321cfbb4938048e6ce602038e72984 --- /dev/null +++ b/subject2-pre/all-process/generate_data/TVHL/tvhl_google.py @@ -0,0 +1,136 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = pd.read_csv(r'F:\ourfl\data\pre_google.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) 
+ model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # Extract the 'relative_time' column + real_relative_time = real_samples[:, 0] + fake_relative_time = fake_samples[:, 0] + + # Remove the 'relative_time' column from the samples + real_samples = real_samples[:, 1:] + fake_samples = fake_samples[:, 1:] + + # Calculate the Euclidean distance between samples without the 'relative_time' feature + distance =0.1* cdist(real_samples, fake_samples, metric='euclidean') + + # Compute the joint distribution loss with 'relative_time' as a condition + conditional_distance = np.abs(real_relative_time - fake_relative_time) + joint_distribution_loss = np.mean(distance * conditional_distance) + + return joint_distribution_loss + +#训练模型 +def train_gan(noise_data, real_data, epochs, batch_size): + for epoch in range(epochs): + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.15 * joint_distribution_loss + # 打印损失和精度 + if epoch % 100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +#设置高斯噪声(全局统一) +noise = np.random.normal(0, 1, size=(10000, 7)) +#训练模型 +train_gan(data, noise, epochs=3000, batch_size=32) +# 数据生成 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data +generated_data.to_csv(r'F:\ourfl\genedate\TVHL\tvhl_google.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/TVHL/tvhl_mircosoft.py 
b/subject2-pre/all-process/generate_data/TVHL/tvhl_mircosoft.py new file mode 100644 index 0000000000000000000000000000000000000000..12d96bb5522d303b7c1d839d5b14f915c5bab2ca --- /dev/null +++ b/subject2-pre/all-process/generate_data/TVHL/tvhl_mircosoft.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = pd.read_csv(r'F:\ourfl\data\pre_microsoft.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # Extract the 'relative_time' column + real_relative_time = real_samples[:, 0] + fake_relative_time = fake_samples[:, 0] + + # Remove the 'relative_time' column from the samples + real_samples = real_samples[:, 1:] + fake_samples = fake_samples[:, 1:] + + # Calculate the Euclidean distance between samples without the 'relative_time' feature + distance =0.1* cdist(real_samples, fake_samples, metric='euclidean') + + # Compute the joint distribution loss with 'relative_time' as a condition + conditional_distance = np.abs(real_relative_time - fake_relative_time) + joint_distribution_loss = np.mean(distance * conditional_distance) + + return joint_distribution_loss + +#训练模型 +def train_gan(noise_data, real_data, epochs, batch_size): + for epoch in range(epochs): + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.15 * joint_distribution_loss + # 打印损失和精度 + if epoch % 100 == 0: + print( + 
f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +#设置高斯噪声(全局统一) +noise = np.random.normal(0, 1, size=(10000, 7)) +#训练模型 +train_gan(data, noise, epochs=3000, batch_size=32) +# 数据生成 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_microsoft.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/TVHL/tvhl_sugan.py b/subject2-pre/all-process/generate_data/TVHL/tvhl_sugan.py new file mode 100644 index 0000000000000000000000000000000000000000..7fda7eeb1921865e0f2b850b1feb234b71e080e3 --- /dev/null +++ b/subject2-pre/all-process/generate_data/TVHL/tvhl_sugan.py @@ -0,0 +1,137 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = pd.read_csv(r'F:\ourfl\data\processed_hefei.csv') + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data[features].values + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # Extract the 'relative_time' column + real_relative_time = real_samples[:, 0] + fake_relative_time = fake_samples[:, 0] + + # Remove 
the 'relative_time' column from the samples + real_samples = real_samples[:, 1:] + fake_samples = fake_samples[:, 1:] + + # Calculate the Euclidean distance between samples without the 'relative_time' feature + distance =0.1* cdist(real_samples, fake_samples, metric='euclidean') + + # Compute the joint distribution loss with 'relative_time' as a condition + conditional_distance = np.abs(real_relative_time - fake_relative_time) + joint_distribution_loss = np.mean(distance * conditional_distance) + + return joint_distribution_loss + +#训练模型 +def train_gan(noise_data, real_data, epochs, batch_size): + for epoch in range(epochs): + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.15 * joint_distribution_loss + # 打印损失和精度 + if epoch % 100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +#设置高斯噪声(全局统一) +noise = np.random.normal(0, 1, size=(10000, 7)) +#训练模型 +train_gan(data, noise, epochs=3000, batch_size=32) +# 数据生成 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_HPC.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/VHL/vhl_alibaba.py b/subject2-pre/all-process/generate_data/VHL/vhl_alibaba.py new file mode 100644 index 0000000000000000000000000000000000000000..e2ebbd2134fabe889662347eb384e4c9bca6f665 --- /dev/null +++ b/subject2-pre/all-process/generate_data/VHL/vhl_alibaba.py @@ -0,0 +1,141 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = 
pd.read_csv(r'F:\ourfl\data\processed_alibaba.csv')

data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan', 'mem_plan']]

data['time'] = data['time'] % 24

features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan', 'mem_plan']
data = data[features].values

scaler = MinMaxScaler(feature_range=(0, 1))
data_scaled = scaler.fit_transform(data)


def build_generator():
    model = Sequential()
    model.add(Dense(64, input_dim=7))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(32))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(7, activation='sigmoid'))
    return model

generator = build_generator()


def build_discriminator():
    model = Sequential()
    model.add(Dense(32, input_dim=7))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(16))
    model.add(LeakyReLU(alpha=0.2))
    model.add(Dense(1, activation='sigmoid'))
    return model

discriminator = build_discriminator()

discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
discriminator.trainable = False
gan_input = tf.keras.Input(shape=(7,))
gan_output = discriminator(generator(gan_input))
gan = tf.keras.Model(gan_input, gan_output)
gan.compile(loss='binary_crossentropy', optimizer='adam')

def compute_joint_distribution_loss(real_samples, fake_samples):
    # Euclidean distance between real and generated samples
    distance = cdist(real_samples, fake_samples, metric='euclidean')
    # Joint-distribution discrepancy term
    joint_distribution_loss = distance.mean()
    return joint_distribution_loss

def train_gan(noise_data, real_data, epochs=3000, batch_size=16):
    for epoch in range(epochs):
        # Sample a batch of noise vectors (as in vhl_mircosoft.py) so that X and y below have matching sizes
        noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)]
        real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)]

        generated_samples = generator.predict(noise)
        X = np.concatenate((real_samples, generated_samples))

        # Create labels (1 for real samples, 0 for generated samples)
        y = np.ones(2 * batch_size)
        y[batch_size:] = 0

        # Train the discriminator
        discriminator.trainable = True
        discriminator_loss = discriminator.train_on_batch(X, y)

        # Train the generator
        noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)]
        y_gen = np.ones(batch_size)
        discriminator.trainable = False
        generator_loss = gan.train_on_batch(noise, y_gen)

        # Compute the joint-distribution discrepancy term
        joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples)
        # Add the discrepancy term to the discriminator loss
        discriminator_loss = discriminator_loss + 0.1 * joint_distribution_loss

        # Print losses periodically
        if epoch % 100 == 0:
            print(
                f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}')

num_samples = 20000
num_rounds = 30
num_hours_per_round = 24

noise = np.random.normal(0, 1, size=(num_samples, 7))
# Train the GAN with the Gaussian noise as generator input and the scaled real workload data as the real samples
train_gan(noise, data_scaled, epochs=3000, batch_size=32)

# Generate the virtual workload data
generated_data = generator.predict(noise)
generated_data = pd.DataFrame(generated_data, columns=features)

# Map the time feature back to the 24-hour scale
generated_data['time'] = (generated_data['time'] * 24).astype(int)

data_by_hour = {}
# Group generated_data by hour into a dictionary
for i in range(24):
    data_by_hour[i] = generated_data[generated_data['time'] == i]

empty_df = pd.DataFrame()
# For each hour, check whether any samples were generated; if not, borrow one from the next hour
for i in range(0, 24):
    if data_by_hour[i].empty:
        next_i = (i + 1) % 24  # index of the next hour; the modulo wraps the last hour back around
        data_by_hour[i] = data_by_hour[next_i].head(1)
    data_length = len(data_by_hour[i])
    if data_length < 31:
        data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)])

for i in range(0, 24):
    empty_df = 
pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_alibaba.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/VHL/vhl_google.py b/subject2-pre/all-process/generate_data/VHL/vhl_google.py new file mode 100644 index 0000000000000000000000000000000000000000..70f2259bce30b8ae66306405bfb4f604b453b408 --- /dev/null +++ b/subject2-pre/all-process/generate_data/VHL/vhl_google.py @@ -0,0 +1,141 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + + +data = pd.read_csv(r'F:\ourfl\data\pre_google.csv') +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data1[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # 计算真实样本和生成样本之间的欧氏距离 + distance = cdist(real_samples, fake_samples, metric='euclidean') + # 计算联合分布差异项 + joint_distribution_loss = distance.mean() + return joint_distribution_loss + +def train_gan(noise_data, real_data, epochs=3000, batch_size=16): + for epoch in range(epochs): + + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + + + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.1 * joint_distribution_loss + + # 打印损失和精度 + if epoch % 
100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(data1_scaled, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_google.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/VHL/vhl_hpc.py b/subject2-pre/all-process/generate_data/VHL/vhl_hpc.py new file mode 100644 index 0000000000000000000000000000000000000000..9f057a4037d2d2d91a5385af2cbe84d2d36904bc --- /dev/null +++ b/subject2-pre/all-process/generate_data/VHL/vhl_hpc.py @@ -0,0 +1,141 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + + +data2 = pd.read_csv(r'F:\ourfl\data\processed_hpc.csv') + + +data = data[['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan']] + + +data['time'] = data['time'] % 24 + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data = data1[features].values + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data_scaled = scaler.fit_transform(data) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='binary_crossentropy', optimizer='adam') + +def compute_joint_distribution_loss(real_samples, fake_samples): + # 
计算真实样本和生成样本之间的欧氏距离 + distance = cdist(real_samples, fake_samples, metric='euclidean') + # 计算联合分布差异项 + joint_distribution_loss = distance.mean() + return joint_distribution_loss + +def train_gan(noise_data, real_data, epochs=3000, batch_size=16): + for epoch in range(epochs): + + noise = noise_data + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + + + generated_samples = generator.predict(noise) + X = np.concatenate((real_samples, generated_samples)) + + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.1 * joint_distribution_loss + + # 打印损失和精度 + if epoch % 100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +# 使用 data1 作为噪声数据集,data2 作为训练数据集训练 GAN 模型 +train_gan(data1_scaled, data2_scaled, epochs=3000, batch_size=32) + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + +# 生成虚拟数据 +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(1, 31): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data,empty_df.iloc[j*24+i:j*24+i+1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_hpc.csv', index=False) + + diff --git a/subject2-pre/all-process/generate_data/VHL/vhl_mircosoft.py b/subject2-pre/all-process/generate_data/VHL/vhl_mircosoft.py new file mode 100644 index 0000000000000000000000000000000000000000..922209158a24ff730806b60f36a67f88b7cff4d1 --- /dev/null +++ b/subject2-pre/all-process/generate_data/VHL/vhl_mircosoft.py @@ -0,0 +1,152 @@ +import numpy as np +import pandas as pd +from tensorflow.keras.models import Sequential +from sklearn.preprocessing import MinMaxScaler +from tensorflow.keras.layers import Dense, LeakyReLU +import tensorflow as tf +from tensorflow.keras.losses import MeanSquaredError +from tensorflow.keras.metrics import RootMeanSquaredError +from scipy.spatial.distance import cdist + +data1 = pd.read_csv(r'F:\ourfl\data\processed_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\data\pre_microsoft.csv') + + + +# 检查 data2 是否缺少特征,如果是,则用 data1 中相应特征的平均值来填充 +for column in data1.columns: + if column not in data2.columns: + 
data2[column] = data1[column].mean() + +# data2 = data2['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] + +features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan','mem_plan'] +data1 = data1[features].values +data2 = data2[features].values + + + +scaler = MinMaxScaler(feature_range=(0, 1)) +data1_scaled = scaler.fit_transform(data1) +data2_scaled = scaler.fit_transform(data2) + + +def build_generator(): + model = Sequential() + model.add(Dense(64, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(32)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(7, activation='sigmoid')) + return model + +generator = build_generator() + + +def build_discriminator(): + model = Sequential() + model.add(Dense(32, input_dim=7)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(16)) + model.add(LeakyReLU(alpha=0.2)) + model.add(Dense(1, activation='sigmoid')) + return model + +discriminator = build_discriminator() + +discriminator.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy']) +discriminator.trainable = False +gan_input = tf.keras.Input(shape=(7,)) +gan_output = discriminator(generator(gan_input)) +gan = tf.keras.Model(gan_input, gan_output) +gan.compile(loss='mean_squared_error', optimizer='adam') + + +def compute_joint_distribution_loss(real_samples, fake_samples): + # 计算真实样本和生成样本之间的欧氏距离 + distance = cdist(real_samples, fake_samples, metric='euclidean') + # 计算联合分布差异项 + joint_distribution_loss = distance.mean() + return joint_distribution_loss + + +def train_gan(noise_data, real_data, epochs=3000, batch_size=32): + for epoch in range(epochs): + + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + + + real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)] + + + generated_samples = generator.predict(noise) + + + X = np.concatenate((real_samples, generated_samples)) + + # 创建标签(真实数据样本为1,合成数据样本为0) + y = np.ones(2 * batch_size) + y[batch_size:] = 0 + + # 训练判别器 + discriminator.trainable = True + discriminator_loss = discriminator.train_on_batch(X, y) + + # 训练生成器 + noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)] + y_gen = np.ones(batch_size) + discriminator.trainable = False + generator_loss = gan.train_on_batch(noise, y_gen) + + # 计算联合分布差异项 + joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples) + # 更新判别器的损失函数 + discriminator_loss = discriminator_loss + 0.1 * joint_distribution_loss + + # 打印损失和精度 + if epoch % 100 == 0: + print( + f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}') + + + +num_samples = 20000 +num_rounds = 30 +num_hours_per_round = 24 + + +noise = np.random.normal(0, 1, size=(num_samples, 7)) +generated_data = generator.predict(noise) +generated_data = pd.DataFrame(generated_data, columns=features) + +# 将时间恢复到24小时制 +generated_data['time'] = (generated_data['time'] * 24).astype(int) + +data_by_hour = {} +# 将generated_data按时间分组,放入字典中对应的键中 +for i in range(24): + data_by_hour[i] = generated_data[generated_data['time'] == i] + +empty_df = pd.DataFrame() +# 遍历1到23小时,检查每个小时的数据是否为空 +for i in range(0, 24): + if data_by_hour[i].empty: + next_i = (i + 1) % 24 # 计算下一个小时的索引,使用取模运算确保最后一个小时的数据获取方式是循环到小时1的数据 + data_by_hour[i] = data_by_hour[next_i].head(1) + data_length = len(data_by_hour[i]) + if data_length < 31: + data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)]) + +for i in range(0, 24): + empty_df = 
pd.concat([empty_df, data_by_hour[i].head(30)]) + +# 复制empty_df的第一行数据,并赋值给time_series_data +time_series_data = empty_df.iloc[:1].copy() +for i in range(0, 30): + for j in range(0, 24): + time_series_data = pd.concat([time_series_data, empty_df.iloc[i * 24 + j:i * 24 + j + 1].copy()]) +generated_data = time_series_data + +generated_data.to_csv(r'F:\ourfl\genedate\VHL\vhl_mircosoft.csv', index=False) + + diff --git a/subject2-pre/all-process/ourfl-nokd/ourfl_gan.py b/subject2-pre/all-process/ourfl-nokd/ourfl_gan.py new file mode 100644 index 0000000000000000000000000000000000000000..849332afdd759e6903657537b116bb4cf70c7506 --- /dev/null +++ b/subject2-pre/all-process/ourfl-nokd/ourfl_gan.py @@ -0,0 +1,237 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +class Config(): + timestep = 1 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 4 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 20 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0005 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + + +# 7.定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + + +data1 = pd.read_csv(r"F:\ourfl\data\processed_alibaba.csv") +data2 = pd.read_csv(r"F:\ourfl\data\pre_google.csv") +data3 = pd.read_csv(r"F:\ourfl\data\pre_microsoft.csv") +data4 = pd.read_csv(r"F:\ourfl\data\processed_hefei.csv") + +# 提取avg_mem列 +df1 = data1[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df2 = data2[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df3 = data3[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df4 = data4[['mem_use','cpu_plan','gpu_plan','mem_plan']] + +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 1 # 您可以根据实际情况调整时间步数 +feature_size = 4 # 
由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + +# 使用您之前定义的Config类来配置全局GRU模型和客户端GRU模型 +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... 
(其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 30 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"flclient_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/ourfl-nokd/ourfl_tvhl.py b/subject2-pre/all-process/ourfl-nokd/ourfl_tvhl.py new file mode 100644 index 0000000000000000000000000000000000000000..7781db241dc2a9842a7c569efd25a088e949f506 --- /dev/null +++ b/subject2-pre/all-process/ourfl-nokd/ourfl_tvhl.py @@ -0,0 +1,238 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +class Config(): + data_path = './data/wind_dataset.csv' + timestep = 4 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 4 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 20 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + + +# 7.定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + 
dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + + +data1 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_google.csv') +data3 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_mircosoft.csv') +data4 = pd.read_csv(r'F:\ourfl\genedate\TVHL\tvhl_sugan1.csv') + +# 提取avg_mem列 +df1 = data1[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df2 = data2[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df3 = data3[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df4 = data4[['mem_use','cpu_plan','gpu_plan','mem_plan']] + +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 4 # 您可以根据实际情况调整时间步数 +feature_size = 4 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + +# 使用您之前定义的Config类来配置全局GRU模型和客户端GRU模型 +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in 
list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... (其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 30 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"fltvhlclient_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/ourfl-nokd/ourfl_vhl.py b/subject2-pre/all-process/ourfl-nokd/ourfl_vhl.py new file mode 100644 index 0000000000000000000000000000000000000000..8b36f5fcbf0084ea9835d635ee8b3dce04e69f8a --- /dev/null +++ b/subject2-pre/all-process/ourfl-nokd/ourfl_vhl.py @@ -0,0 +1,238 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import MinMaxScaler +import torch.optim as optim # 添加导入torch.optim模块 + + +class Config(): + data_path = './data/wind_dataset.csv' + timestep = 1 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 4 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 10 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0003 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + + +# 7.定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = 
output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] + dataY = [] + + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) + + dataX = np.array(dataX) + dataY = np.array(dataY) + + train_size = int(np.round(0.6 * dataX.shape[0])) + + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + + +data1 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_alibaba.csv') +data2 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_google.csv') +data3 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_mircosoft.csv') +data4 = pd.read_csv(r'F:\ourfl\genedate\VHL\vhl_sugan1.csv') + +# 提取avg_mem列 +df1 = data1[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df2 = data2[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df3 = data3[['mem_use','cpu_plan','gpu_plan','mem_plan']] +df4 = data4[['mem_use','cpu_plan','gpu_plan','mem_plan']] + +# 标准化数据 +scaler = MinMaxScaler(feature_range=(0, 1)) +data1 = scaler.fit_transform(df1.values) +data2 = scaler.fit_transform(df2.values) +data3 = scaler.fit_transform(df3.values) +data4 = scaler.fit_transform(df4.values) + +# 定义时间步数和特征数 +timestep = 1 # 您可以根据实际情况调整时间步数 +feature_size = 4 # 由于您已经提取了"mem_use"列作为特征,所以特征数为1 + +# 划分数据集 +x_train1, y_train1, x_test1, y_test1 = split_data(data1, timestep, feature_size) +x_train2, y_train2, x_test2, y_test2 = split_data(data2, timestep, feature_size) +x_train3, y_train3, x_test3, y_test3 = split_data(data3, timestep, feature_size) +x_train4, y_train4, x_test4, y_test4 = split_data(data4, timestep, feature_size) + + +class ClientGRU(nn.Module): + def __init__(self, config): + super(ClientGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers, batch_first=True) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + output, hidden = self.gru(x, hidden) + output = self.fc(output[:, -1, :]) # 调整输出维度 + return output + + + + #定义全局GRU模型 +class GlobalGRU(nn.Module): + def __init__(self, config): + super(GlobalGRU, self).__init__() + self.gru = nn.GRU(config.feature_size, config.hidden_size, config.num_layers) + self.fc = nn.Linear(config.hidden_size, config.output_size) + + def forward(self, x, hidden=None): + # 自定义GlobalGRU类的前向传播过程,使用GRU和Linear层 + output, hidden = self.gru(x, hidden) + output = self.fc(output) + return output, hidden + + + + +# 使用您之前定义的Config类来配置全局GRU模型和客户端GRU模型 +config = Config() +global_model = GlobalGRU(config) +client_models = [ClientGRU(config) for _ in range(4)] + + + +# 定义客户端训练函数 +def client_train(client_model, data, global_model_parameters, config): + # 将数据转换为PyTorch张量 + x_train, y_train, _, _ = data + x_train = torch.tensor(x_train, dtype=torch.float32) + y_train = torch.tensor(y_train, dtype=torch.float32) + # 设置优化器和损失函数 + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + hidden = None + # 将全局模型参数加载到客户端模型中 + client_model.load_state_dict(global_model_parameters) + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + # 前向传播 + output, hidden = client_model(x_train, hidden) + loss = criterion(output, y_train) + # 反向传播和参数更新 + loss.backward() + 
optimizer.step() + # 返回更新后的客户端模型参数 + return client_model.state_dict() + + + +# 定义全局模型参数平均函数 +def global_average_parameters(client_parameters_list): + num_parameters = len(list(client_parameters_list[0].values())) + sum_parameters = [torch.zeros_like(param) for param in list(client_parameters_list[0].values())] + + for client_parameters in client_parameters_list: + for i, param in enumerate(client_parameters.values()): + sum_parameters[i] += param + + averaged_parameters = [param_sum / len(client_parameters_list) for param_sum in sum_parameters] + return averaged_parameters + +# ... (其他代码保持不变) + +# 设置全局epoch数 +global_epoch = 50 +# 将数据转换为PyTorch张量 +# 将数据转换为PyTorch张量并指定数据类型为float32 +x_train_list = [torch.tensor(x_train1, dtype=torch.float32), + torch.tensor(x_train2, dtype=torch.float32), + torch.tensor(x_train3, dtype=torch.float32), + torch.tensor(x_train4, dtype=torch.float32)] +y_train_list = [torch.tensor(y_train1, dtype=torch.float32), + torch.tensor(y_train2, dtype=torch.float32), + torch.tensor(y_train3, dtype=torch.float32), + torch.tensor(y_train4, dtype=torch.float32)] + +# 进行联邦学习的全局epoch循环 +for global_epoch in range(global_epoch): + updated_client_parameters_list = [] + + # 客户端训练并更新模型参数 + for client_model, x_train, y_train in zip(client_models, x_train_list, y_train_list): + optimizer = optim.Adam(client_model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + + # 进行本地模型的训练 + for local_epoch in range(config.epochs): + optimizer.zero_grad() + output = client_model(x_train) # 不再解包 + loss = criterion(output, y_train) + loss.backward() + optimizer.step() + + # 获取更新后的客户端模型参数 + updated_client_parameters_list.append(client_model.state_dict()) + + # 全局参数平均化 + averaged_parameters = global_average_parameters(updated_client_parameters_list) + + # 更新全局模型的参数为平均后的参数 + with torch.no_grad(): + for global_param, averaged_param in zip(global_model.parameters(), averaged_parameters): + global_param.copy_(averaged_param) + + +# 保存每个客户端模型 +for i, client_model in enumerate(client_models): + model_path = f"flvhlclient_model_{i+1}.pth" + torch.save(client_model.state_dict(), model_path) + diff --git a/subject2-pre/all-process/ourfl_kd/a1.py b/subject2-pre/all-process/ourfl_kd/a1.py new file mode 100644 index 0000000000000000000000000000000000000000..a5ddab0c7210cde31c9b474e424c3505b4fb31dc --- /dev/null +++ b/subject2-pre/all-process/ourfl_kd/a1.py @@ -0,0 +1,300 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from torch.utils.data import TensorDataset +from tqdm import tqdm +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +data1 = pd.read_csv(r'F:\ourfl\data\processed_alibaba.csv') +df = data1[['cpu_use']] +# 2.将数据进行标准化 +scaler = MinMaxScaler(feature_range=(0, 1)) +data = scaler.fit_transform(df.values) + + + +class Config(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + +#新模型 +class newConfig(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 512 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs 
= 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +newconfig = newConfig() + +# 形成训练数据 +def split_data1(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + return dataX, dataY + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + # 获取训练集大小 + train_size = int(np.round(0.6 * dataX.shape[0])) + + # 划分训练集、测试集 + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + +# 定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 获取训练数据 +x_train, y_train, x_test, y_test = split_data(data, config.timestep, config.feature_size) + +x_test1, y_test1 = split_data1(data, config.timestep, config.feature_size) + +# 4.将数据转为tensor +x_train_tensor = torch.from_numpy(x_train).to(torch.float32) +y_train_tensor = torch.from_numpy(y_train).to(torch.float32) +x_test_tensor = torch.from_numpy(x_test).to(torch.float32) +y_test_tensor = torch.from_numpy(y_test).to(torch.float32) + +# 5.形成训练数据集 +train_data = TensorDataset(x_train_tensor, y_train_tensor) +test_data = TensorDataset(x_test_tensor, y_test_tensor) + +# 6.将数据加载成迭代器 +train_loader = torch.utils.data.DataLoader(train_data, + config.batch_size, + shuffle=False) + +test_loader = torch.utils.data.DataLoader(test_data, + config.batch_size, + shuffle=False) + +# 假设您的模型类是GRU,config中包含相关的模型配置信息 +pre_trained_model = GRU(config.feature_size, config.hidden_size, config.num_layers, config.output_size) + +# 加载预训练模型的参数 +# model_path = r"F:\ourfl\fedavg\tvhl_client_model_1.pth" +model_path = r"F:\ourfl\fedavg\client_model_1.pth" + +pre_trained_model.load_state_dict(torch.load(model_path)) + +transfer_model = GRU(newconfig.feature_size, newconfig.hidden_size, newconfig.num_layers, newconfig.output_size) # 定义GRU网络 +loss_function = nn.MSELoss() # 定义损失函数 +optimizer_transfer = torch.optim.AdamW(transfer_model.parameters(), lr= newconfig.learning_rate) # 定义优化器 + +# 定义生成随机矩阵的数量 +num_random_matrices = 30 + +# 获取预训练模型和新模型的参数 
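# The mapping step below transplants the trained public GRU's parameters into
# the larger personal model: each candidate multiplies the public parameters by
# a random scaling factor, and the candidate that minimises the L2 norm of the
# resulting personal-model parameters is kept before post-training starts.
#
# For reference, a minimal sketch of the distillation objective optimised in
# the post-training loop further below; the weighting factor `u` is an
# assumption borrowed from g1.py (this script uses an unweighted sum), and
# kd_objective is a hypothetical helper, not part of the original code:
def kd_objective(y_personal, y_label, y_public, u=0.5):
    """Hard MSE on the private labels plus a soft MSE toward the frozen public model."""
    hard_loss = nn.functional.mse_loss(y_personal, y_label)
    soft_loss = nn.functional.mse_loss(y_personal, y_public)
    return u * hard_loss + (1 - u) * soft_loss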
pre_trained_params = pre_trained_model.state_dict()
transfer_params = transfer_model.state_dict()  # parameters of the new (personal) transfer model

# L2 norm of the pre-trained (public) model parameters
pre_trained_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in pre_trained_params.values()]))

# L2 norm of the new model parameters
transfer_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()]))

# Initialise the minimum L2 norm and the corresponding random scaling factor
min_l2_norm = float('inf')
min_l2_norm_random_matrix = None

# Generate candidate random scaling factors and evaluate the resulting L2 norm
for i in range(num_random_matrices):
    # Random scaling factor (a 0-dim tensor, since pre_trained_l2_norm is a scalar)
    random_matrix = torch.rand(pre_trained_l2_norm.size())

    # Apply the scaling factor to the pre-trained parameters and copy them into the new model
    for name, param in pre_trained_params.items():
        if "gru" in name:
            name = name.replace("gru.", "")  # drop the module-name prefix
        if name in transfer_params:
            transfer_params[name].copy_(param * random_matrix)

    # L2 norm of the new model parameters after this mapping
    transfer_l2_norm_after_mapping = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()]))

    # Keep the scaling factor that yields the smallest L2 norm
    if transfer_l2_norm_after_mapping < min_l2_norm:
        min_l2_norm = transfer_l2_norm_after_mapping
        min_l2_norm_random_matrix = random_matrix

# Apply the best scaling factor to the pre-trained parameters
for name, param in pre_trained_params.items():
    if "gru" in name:
        name = name.replace("gru.", "")  # drop the module-name prefix
    if name in transfer_params:
        transfer_params[name].copy_(param * min_l2_norm_random_matrix)


num_transfer_epochs = 30  # number of post-training epochs; adjust as needed


# Freeze the pre-trained public model so it only provides distillation targets
for param in pre_trained_model.parameters():
    param.requires_grad = False


for epoch in range(num_transfer_epochs):
    transfer_model.train()
    running_loss = 0
    train_bar = tqdm(train_loader)  # reuse the data loader defined above
    for data in train_bar:
        x_train, y_train = data
        optimizer_transfer.zero_grad()
        y_train_pred = transfer_model(x_train)
        y_pretrained_pred = pre_trained_model(x_train)
        # Hard loss on the private labels plus a distillation loss toward the frozen public model's predictions
        loss = loss_function(y_train_pred, y_train) + loss_function(y_train_pred, y_pretrained_pred)
        loss.backward()
        optimizer_transfer.step()

        running_loss += loss.item()
        train_bar.set_description("Transfer Learning epoch[{}/{}] loss:{:.3f}".format(epoch + 1, num_transfer_epochs, loss.item()))


transfer_model_save_path = './transfer_model.pth'
torch.save(transfer_model.state_dict(), transfer_model_save_path)

print('Finished Transfer Learning')

transfer_model.eval()

# Combine the training and test data to use the entire dataset
x_combined = np.concatenate([x_train, x_test], axis=0)
y_combined = np.concatenate([y_train, y_test], axis=0)

# Convert the combined data to tensors
x_combined_tensor = torch.from_numpy(x_combined).to(torch.float32)
y_combined_tensor = torch.from_numpy(y_combined).to(torch.float32)

# Create a new DataLoader for the combined data
combined_data = TensorDataset(x_combined_tensor, y_combined_tensor)
combined_loader = torch.utils.data.DataLoader(combined_data, batch_size=config.batch_size, shuffle=False)

with torch.no_grad():
    y_true = []  # Ground truth labels
    y_pred = []  # Predicted labels
    for data in test_loader:  # Using the test data loader
        x_test1, y_test1 = data
        predictions = transfer_model(x_test1)
        y_true.extend(y_test1.numpy().flatten())
        y_pred.extend(predictions.numpy().flatten())

# Convert the lists to numpy arrays for easier calculations
y_true = np.array(y_true)
y_pred = np.array(y_pred)

# Calculate evaluation metrics
mse = mean_squared_error(y_true, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_true, y_pred)

# Calculate MAPE and SMAPE with checks for division by zero
mape_denominator = 
np.maximum(np.abs(y_true), 1e-8) # Avoid division by zero +mape = np.mean(np.abs((y_true - y_pred) / mape_denominator)) * 100 + +smape_denominator = np.maximum(np.abs(y_true) + np.abs(y_pred), 1e-8) # Avoid division by zero +smape = 2.0 * np.mean(np.abs(y_pred - y_true) / smape_denominator) * 100 + + +print("Mean Squared Error (MSE):", mse) +print("Root Mean Squared Error (RMSE):", rmse) +print("Mean Absolute Error (MAE):", mae) +print("Mean Absolute Percentage Error (MAPE):", mape) +print("Symmetric Mean Absolute Percentage Error (SMAPE):", smape) + +# Save predictions and metrics to CSV +result_df = pd.DataFrame({'Truth': y_true, 'Predicted': y_pred}) +result_df.to_csv('transfer_learning_predictions.csv', index=False) + +metrics_df = pd.DataFrame({ + 'MSE': [mse], + 'RMSE': [rmse], + 'MAE': [mae], + 'MAPE': [mape], + 'SMAPE': [smape] +}) +metrics_df.to_csv('transfer_learning_metrics.csv', index=False) + +print("Transfer learning predictions and metrics saved successfully!") diff --git a/subject2-pre/all-process/ourfl_kd/g1.py b/subject2-pre/all-process/ourfl_kd/g1.py new file mode 100644 index 0000000000000000000000000000000000000000..46aef3e49ff0f6e8d5f933ddc8c7fefc2b3b08a3 --- /dev/null +++ b/subject2-pre/all-process/ourfl_kd/g1.py @@ -0,0 +1,301 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from torch.utils.data import TensorDataset +from tqdm import tqdm +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +data1 = pd.read_csv(r'F:\ourfl\data\pre_google.csv') +df = data1[['cpu_use']] +# 2.将数据进行标准化 +scaler = MinMaxScaler(feature_range=(0, 1)) +data = scaler.fit_transform(df.values) + +u = 0.5 + +num_transfer_epochs = 30 # 根据需要设置合适的训练轮数 + + +class Config(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0005 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + +#新模型 +class newConfig(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 512 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 4 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +newconfig = newConfig() + +# 形成训练数据 +def split_data1(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + return dataX, dataY + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + # 获取训练集大小 + train_size = int(np.round(0.6 * dataX.shape[0])) + + # 划分训练集、测试集 + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] 
+ y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + +# 定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 获取训练数据 +x_train, y_train, x_test, y_test = split_data(data, config.timestep, config.feature_size) + +x_test1, y_test1 = split_data1(data, config.timestep, config.feature_size) + +# 4.将数据转为tensor +x_train_tensor = torch.from_numpy(x_train).to(torch.float32) +y_train_tensor = torch.from_numpy(y_train).to(torch.float32) +x_test_tensor = torch.from_numpy(x_test).to(torch.float32) +y_test_tensor = torch.from_numpy(y_test).to(torch.float32) + +# 5.形成训练数据集 +train_data = TensorDataset(x_train_tensor, y_train_tensor) +test_data = TensorDataset(x_test_tensor, y_test_tensor) + +# 6.将数据加载成迭代器 +train_loader = torch.utils.data.DataLoader(train_data, + config.batch_size, + shuffle=False) + +test_loader = torch.utils.data.DataLoader(test_data, + config.batch_size, + shuffle=False) + +# 假设您的模型类是GRU,config中包含相关的模型配置信息 +pre_trained_model = GRU(config.feature_size, config.hidden_size, config.num_layers, config.output_size) + +# 加载预训练模型的参数 +# model_path = r"F:\ourfl\fedavg\tvhl_client_model_1.pth" +model_path = r"F:\ourfl\fedavg\client_model_2.pth" + +pre_trained_model.load_state_dict(torch.load(model_path)) + +transfer_model = GRU(newconfig.feature_size, newconfig.hidden_size, newconfig.num_layers, newconfig.output_size) # 定义GRU网络 +loss_function = nn.MSELoss() # 定义损失函数 +optimizer_transfer = torch.optim.AdamW(transfer_model.parameters(), lr= newconfig.learning_rate) # 定义优化器 + +# 定义生成随机矩阵的数量 +num_random_matrices = 100 + +# 获取预训练模型和新模型的参数 +pre_trained_params = pre_trained_model.state_dict() +transfer_params = transfer_model.state_dict() # 使用之前定义的模型 model,而不是新的 transfer_model + +# 计算预训练模型参数的 L2 范数 +pre_trained_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in pre_trained_params.values()])) + +# 计算新模型参数的 L2 范数 +transfer_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + +# 初始化最小 L2 范数和对应的随机矩阵 +min_l2_norm = float('inf') +min_l2_norm_random_matrix = None + +# 循环生成随机矩阵,并计算 L2 范数 +for i in range(num_random_matrices): + # 生成随机矩阵 + random_matrix = torch.rand(pre_trained_l2_norm.size()) + + # 应用随机矩阵到预训练模型参数 + for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * random_matrix) + + # 计算使用当前随机矩阵映射后的新模型参数的 L2 范数 + transfer_l2_norm_after_mapping = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + + # 更新最小 L2 范数和对应的随机矩阵 + if transfer_l2_norm_after_mapping < min_l2_norm: + min_l2_norm = transfer_l2_norm_after_mapping + 
min_l2_norm_random_matrix = random_matrix + +# 将最优的随机矩阵应用于预训练模型参数 +for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * min_l2_norm_random_matrix) + + +# 将已有模型的参数设为不可训练 +for param in pre_trained_model.parameters(): + param.requires_grad = False + + +for epoch in range(num_transfer_epochs): + transfer_model.train() + running_loss = 0 + train_bar = tqdm(train_loader) # 使用之前定义的数据加载器 train_loader + for data in train_bar: + x_train, y_train = data + optimizer_transfer.zero_grad() + y_train_pred = transfer_model(x_train) + y_pretrained_pred = pre_trained_model(x_train) + loss = (1-u)*loss_function(y_train_pred, y_train) + u * loss_function(y_train_pred, y_pretrained_pred) + loss.backward() + optimizer_transfer.step() + + running_loss += loss.item() + train_bar.set_description("Transfer Learning epoch[{}/{}] loss:{:.3f}".format(epoch + 1, num_transfer_epochs, loss.item())) + + + +transfer_model_save_path = './transfer_model2.pth' +torch.save(transfer_model.state_dict(), transfer_model_save_path) + +print('Finished Transfer Learning') + +transfer_model.eval() + +# Combine the training and test data to use the entire dataset +x_combined = np.concatenate([x_train, x_test], axis=0) +y_combined = np.concatenate([y_train, y_test], axis=0) + +# Convert the combined data to tensors +x_combined_tensor = torch.from_numpy(x_combined).to(torch.float32) +y_combined_tensor = torch.from_numpy(y_combined).to(torch.float32) + +# Create a new DataLoader for the combined data +combined_data = TensorDataset(x_combined_tensor, y_combined_tensor) +combined_loader = torch.utils.data.DataLoader(combined_data, batch_size=config.batch_size, shuffle=False) + +with torch.no_grad(): + y_true = [] # Ground truth labels + y_pred = [] # Predicted labels + for data in test_loader: # Using the test data loader + x_test1, y_test1 = data + predictions = transfer_model(x_test1) + y_true.extend(y_test1.numpy().flatten()) + y_pred.extend(predictions.numpy().flatten()) + +# Convert the lists to numpy arrays for easier calculations +y_true = np.array(y_true) +y_pred = np.array(y_pred) + +# Calculate evaluation metrics +mse = mean_squared_error(y_true, y_pred) +rmse = np.sqrt(mse) +mae = mean_absolute_error(y_true, y_pred) + +# Calculate MAPE and SMAPE with checks for division by zero +mape_denominator = np.maximum(np.abs(y_true), 1e-8) # Avoid division by zero +mape = np.mean(np.abs((y_true - y_pred) / mape_denominator)) * 100 + +smape_denominator = np.maximum(np.abs(y_true) + np.abs(y_pred), 1e-8) # Avoid division by zero +smape = 2.0 * np.mean(np.abs(y_pred - y_true) / smape_denominator) * 100 + + +print("Mean Squared Error (MSE):", mse) +print("Root Mean Squared Error (RMSE):", rmse) +print("Mean Absolute Error (MAE):", mae) +print("Mean Absolute Percentage Error (MAPE):", mape) +print("Symmetric Mean Absolute Percentage Error (SMAPE):", smape) + +# Save predictions and metrics to CSV +result_df = pd.DataFrame({'Truth': y_true, 'Predicted': y_pred}) +result_df.to_csv('transfer_learning_predictions2.csv', index=False) + +metrics_df = pd.DataFrame({ + 'MSE': [mse], + 'RMSE': [rmse], + 'MAE': [mae], + 'MAPE': [mape], + 'SMAPE': [smape] +}) +metrics_df.to_csv('transfer_learning_metrics2.csv', index=False) + +print("Transfer learning predictions and metrics saved successfully!") diff --git a/subject2-pre/all-process/ourfl_kd/m1.py b/subject2-pre/all-process/ourfl_kd/m1.py new file mode 100644 index 
0000000000000000000000000000000000000000..2e4f66ce7b05e71ad2a097c0ae43d57044324b2d --- /dev/null +++ b/subject2-pre/all-process/ourfl_kd/m1.py @@ -0,0 +1,299 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from torch.utils.data import TensorDataset +from tqdm import tqdm +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +data1 = pd.read_csv(r'F:\ourfl\data\pre_microsoft.csv') +df = data1[['cpu_use']] +# 2.将数据进行标准化 +scaler = MinMaxScaler(feature_range=(0, 1)) +data = scaler.fit_transform(df.values) + +u = 0.5 +num_transfer_epochs = 30 # 根据需要设置合适的训练轮数 + +class Config(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0005 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + +#新模型 +class newConfig(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 4 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +newconfig = newConfig() + +# 形成训练数据 +def split_data1(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + return dataX, dataY + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + # 获取训练集大小 + train_size = int(np.round(0.6 * dataX.shape[0])) + + # 划分训练集、测试集 + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + +# 定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 获取训练数据 +x_train, y_train, x_test, y_test = split_data(data, config.timestep, config.feature_size) + 
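# Note (editorial): split_data builds sliding windows of length config.timestep over the
# normalized CPU-utilization series and splits them 60/40 chronologically into train/test
# sets; split_data1 returns the same windows without splitting. The x_test1/y_test1 names
# are rebound inside the evaluation loop further below, so the unsplit copy is effectively
# unused in this script.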
+x_test1, y_test1 = split_data1(data, config.timestep, config.feature_size) + +# 4.将数据转为tensor +x_train_tensor = torch.from_numpy(x_train).to(torch.float32) +y_train_tensor = torch.from_numpy(y_train).to(torch.float32) +x_test_tensor = torch.from_numpy(x_test).to(torch.float32) +y_test_tensor = torch.from_numpy(y_test).to(torch.float32) + +# 5.形成训练数据集 +train_data = TensorDataset(x_train_tensor, y_train_tensor) +test_data = TensorDataset(x_test_tensor, y_test_tensor) + +# 6.将数据加载成迭代器 +train_loader = torch.utils.data.DataLoader(train_data, + config.batch_size, + shuffle=False) + +test_loader = torch.utils.data.DataLoader(test_data, + config.batch_size, + shuffle=False) + +# 假设您的模型类是GRU,config中包含相关的模型配置信息 +pre_trained_model = GRU(config.feature_size, config.hidden_size, config.num_layers, config.output_size) + +# 加载预训练模型的参数 +# model_path = r"F:\ourfl\fedavg\tvhl_client_model_1.pth" +model_path = r"F:\ourfl\fedavg\client_model_3.pth" + +pre_trained_model.load_state_dict(torch.load(model_path)) + +transfer_model = GRU(newconfig.feature_size, newconfig.hidden_size, newconfig.num_layers, newconfig.output_size) # 定义GRU网络 +loss_function = nn.MSELoss() # 定义损失函数 +optimizer_transfer = torch.optim.AdamW(transfer_model.parameters(), lr= newconfig.learning_rate) # 定义优化器 + +# 定义生成随机矩阵的数量 +num_random_matrices = 100 + +# 获取预训练模型和新模型的参数 +pre_trained_params = pre_trained_model.state_dict() +transfer_params = transfer_model.state_dict() # 使用之前定义的模型 model,而不是新的 transfer_model + +# 计算预训练模型参数的 L2 范数 +pre_trained_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in pre_trained_params.values()])) + +# 计算新模型参数的 L2 范数 +transfer_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + +# 初始化最小 L2 范数和对应的随机矩阵 +min_l2_norm = float('inf') +min_l2_norm_random_matrix = None + +# 循环生成随机矩阵,并计算 L2 范数 +for i in range(num_random_matrices): + # 生成随机矩阵 + random_matrix = torch.rand(pre_trained_l2_norm.size()) + + # 应用随机矩阵到预训练模型参数 + for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * random_matrix) + + # 计算使用当前随机矩阵映射后的新模型参数的 L2 范数 + transfer_l2_norm_after_mapping = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + + # 更新最小 L2 范数和对应的随机矩阵 + if transfer_l2_norm_after_mapping < min_l2_norm: + min_l2_norm = transfer_l2_norm_after_mapping + min_l2_norm_random_matrix = random_matrix + +# 将最优的随机矩阵应用于预训练模型参数 +for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * min_l2_norm_random_matrix) + + +# 将已有模型的参数设为不可训练 +for param in pre_trained_model.parameters(): + param.requires_grad = False + + +for epoch in range(num_transfer_epochs): + transfer_model.train() + running_loss = 0 + train_bar = tqdm(train_loader) # 使用之前定义的数据加载器 train_loader + for data in train_bar: + x_train, y_train = data + optimizer_transfer.zero_grad() + y_train_pred = transfer_model(x_train) + y_pretrained_pred = pre_trained_model(x_train) + loss = (1-u)*loss_function(y_train_pred, y_train) + u * loss_function(y_train_pred, y_pretrained_pred) + loss.backward() + optimizer_transfer.step() + + running_loss += loss.item() + train_bar.set_description("Transfer Learning epoch[{}/{}] loss:{:.3f}".format(epoch + 1, num_transfer_epochs, loss.item())) + + + +transfer_model_save_path = './transfer_model3.pth' +torch.save(transfer_model.state_dict(), 
transfer_model_save_path) + +print('Finished Transfer Learning') + +transfer_model.eval() + +# Combine the training and test data to use the entire dataset +x_combined = np.concatenate([x_train, x_test], axis=0) +y_combined = np.concatenate([y_train, y_test], axis=0) + +# Convert the combined data to tensors +x_combined_tensor = torch.from_numpy(x_combined).to(torch.float32) +y_combined_tensor = torch.from_numpy(y_combined).to(torch.float32) + +# Create a new DataLoader for the combined data +combined_data = TensorDataset(x_combined_tensor, y_combined_tensor) +combined_loader = torch.utils.data.DataLoader(combined_data, batch_size=config.batch_size, shuffle=False) + +with torch.no_grad(): + y_true = [] # Ground truth labels + y_pred = [] # Predicted labels + for data in test_loader: # Using the test data loader + x_test1, y_test1 = data + predictions = transfer_model(x_test1) + y_true.extend(y_test1.numpy().flatten()) + y_pred.extend(predictions.numpy().flatten()) + +# Convert the lists to numpy arrays for easier calculations +y_true = np.array(y_true) +y_pred = np.array(y_pred) + +# Calculate evaluation metrics +mse = mean_squared_error(y_true, y_pred) +rmse = np.sqrt(mse) +mae = mean_absolute_error(y_true, y_pred) + +# Calculate MAPE and SMAPE with checks for division by zero +mape_denominator = np.maximum(np.abs(y_true), 1e-8) # Avoid division by zero +mape = np.mean(np.abs((y_true - y_pred) / mape_denominator)) * 100 + +smape_denominator = np.maximum(np.abs(y_true) + np.abs(y_pred), 1e-8) # Avoid division by zero +smape = 2.0 * np.mean(np.abs(y_pred - y_true) / smape_denominator) * 100 + + +print("Mean Squared Error (MSE):", mse) +print("Root Mean Squared Error (RMSE):", rmse) +print("Mean Absolute Error (MAE):", mae) +print("Mean Absolute Percentage Error (MAPE):", mape) +print("Symmetric Mean Absolute Percentage Error (SMAPE):", smape) + +# Save predictions and metrics to CSV +result_df = pd.DataFrame({'Truth': y_true, 'Predicted': y_pred}) +result_df.to_csv('transfer_learning_predictions3.csv', index=False) + +metrics_df = pd.DataFrame({ + 'MSE': [mse], + 'RMSE': [rmse], + 'MAE': [mae], + 'MAPE': [mape], + 'SMAPE': [smape] +}) +metrics_df.to_csv('transfer_learning_metrics3.csv', index=False) + +print("Transfer learning predictions and metrics saved successfully!") diff --git a/subject2-pre/all-process/ourfl_kd/s1.py b/subject2-pre/all-process/ourfl_kd/s1.py new file mode 100644 index 0000000000000000000000000000000000000000..2f4050ac43539640faddf8c4ee986162f9ca51eb --- /dev/null +++ b/subject2-pre/all-process/ourfl_kd/s1.py @@ -0,0 +1,299 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler, MinMaxScaler +from torch.utils.data import TensorDataset +from tqdm import tqdm +from sklearn.metrics import mean_squared_error, mean_absolute_error + + +data1 = pd.read_csv(r'F:\ourflc\data\processed_hefei.csv') +df = data1[['cpu_use']] +# 2.将数据进行标准化 +scaler = MinMaxScaler(feature_range=(0, 1)) +data = scaler.fit_transform(df.values) + +u = 0.5 +num_transfer_epochs = 30 # 根据需要设置合适的训练轮数 + +class Config(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 2 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + +#新模型 
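# Note (editorial): Config mirrors the architecture of the trained public model checkpoint
# loaded below (2-layer GRU, hidden size 256), while newConfig describes the personal model
# to be post-trained on this provider's data, keeping the same hidden size but using a
# deeper 4-layer GRU.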
+class newConfig(): + + timestep = 2 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次 + feature_size = 1 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 256 # 隐层大小 + output_size = 1 # 由于是单输出任务,最终输出层大小为1,预测未来1天的avg_mem + num_layers = 4 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.0001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +newconfig = newConfig() + +# 形成训练数据 +def split_data1(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + return dataX, dataY + +# 形成训练数据 +def split_data(data, timestep, feature_size): + dataX = [] # 保存X + dataY = [] # 保存Y + + # 将整个窗口的数据保存到X中,将未来一天保存到Y中 + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep][0]) # 目标列avg_mem为第5列,索引为4 + + dataX = np.array(dataX) + dataY = np.array(dataY) + + # 获取训练集大小 + train_size = int(np.round(0.6 * dataX.shape[0])) + + # 划分训练集、测试集 + x_train = dataX[: train_size, :] + y_train = dataY[: train_size].reshape(-1, 1) + + x_test = dataX[train_size:, :] + y_test = dataY[train_size:].reshape(-1, 1) + + return [x_train, y_train, x_test, y_test] + +# 定义GRU网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size # 隐层大小 + self.num_layers = num_layers # gru层数 + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + + # 初始化隐层状态 + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + + # GRU运算 + output, h_0 = self.gru(x, h_0) + + # 获取GRU输出的维度信息 + batch_size, timestep, hidden_size = output.shape + + # 将output变成 batch_size * timestep, hidden_dim + output = output.reshape(-1, hidden_size) + + # 全连接层 + output = self.fc(output) + + # 转换维度,用于输出 + output = output.reshape(batch_size, timestep, -1) + + # 返回最后一个时间步的数据 + return output[:, -1, :] + + +# 获取训练数据 +x_train, y_train, x_test, y_test = split_data(data, config.timestep, config.feature_size) + +x_test1, y_test1 = split_data1(data, config.timestep, config.feature_size) + +# 4.将数据转为tensor +x_train_tensor = torch.from_numpy(x_train).to(torch.float32) +y_train_tensor = torch.from_numpy(y_train).to(torch.float32) +x_test_tensor = torch.from_numpy(x_test).to(torch.float32) +y_test_tensor = torch.from_numpy(y_test).to(torch.float32) + +# 5.形成训练数据集 +train_data = TensorDataset(x_train_tensor, y_train_tensor) +test_data = TensorDataset(x_test_tensor, y_test_tensor) + +# 6.将数据加载成迭代器 +train_loader = torch.utils.data.DataLoader(train_data, + config.batch_size, + shuffle=False) + +test_loader = torch.utils.data.DataLoader(test_data, + config.batch_size, + shuffle=False) + +# 假设您的模型类是GRU,config中包含相关的模型配置信息 +pre_trained_model = GRU(config.feature_size, config.hidden_size, config.num_layers, config.output_size) + +# 加载预训练模型的参数 +# model_path = r"F:\ourfl\fedavg\tvhl_client_model_1.pth" +model_path = r"F:\ourfl\fedavg\client_model_4.pth" + +pre_trained_model.load_state_dict(torch.load(model_path)) + +transfer_model = GRU(newconfig.feature_size, newconfig.hidden_size, newconfig.num_layers, newconfig.output_size) 
# 定义GRU网络 +loss_function = nn.MSELoss() # 定义损失函数 +optimizer_transfer = torch.optim.AdamW(transfer_model.parameters(), lr= newconfig.learning_rate) # 定义优化器 + +# 定义生成随机矩阵的数量 +num_random_matrices = 100 + +# 获取预训练模型和新模型的参数 +pre_trained_params = pre_trained_model.state_dict() +transfer_params = transfer_model.state_dict() # 使用之前定义的模型 model,而不是新的 transfer_model + +# 计算预训练模型参数的 L2 范数 +pre_trained_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in pre_trained_params.values()])) + +# 计算新模型参数的 L2 范数 +transfer_l2_norm = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + +# 初始化最小 L2 范数和对应的随机矩阵 +min_l2_norm = float('inf') +min_l2_norm_random_matrix = None + +# 循环生成随机矩阵,并计算 L2 范数 +for i in range(num_random_matrices): + # 生成随机矩阵 + random_matrix = torch.rand(pre_trained_l2_norm.size()) + + # 应用随机矩阵到预训练模型参数 + for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * random_matrix) + + # 计算使用当前随机矩阵映射后的新模型参数的 L2 范数 + transfer_l2_norm_after_mapping = torch.norm(torch.stack([torch.norm(param) for param in transfer_params.values()])) + + # 更新最小 L2 范数和对应的随机矩阵 + if transfer_l2_norm_after_mapping < min_l2_norm: + min_l2_norm = transfer_l2_norm_after_mapping + min_l2_norm_random_matrix = random_matrix + +# 将最优的随机矩阵应用于预训练模型参数 +for name, param in pre_trained_params.items(): + if "gru" in name: + name = name.replace("gru.", "") # 移除模型名称前缀 + if name in transfer_params: + transfer_params[name].copy_(param * min_l2_norm_random_matrix) + + +# 将已有模型的参数设为不可训练 +for param in pre_trained_model.parameters(): + param.requires_grad = False + + +for epoch in range(num_transfer_epochs): + transfer_model.train() + running_loss = 0 + train_bar = tqdm(train_loader) # 使用之前定义的数据加载器 train_loader + for data in train_bar: + x_train, y_train = data + optimizer_transfer.zero_grad() + y_train_pred = transfer_model(x_train) + y_pretrained_pred = pre_trained_model(x_train) + loss = (1-u)*loss_function(y_train_pred, y_train) + u * loss_function(y_train_pred, y_pretrained_pred) + loss.backward() + optimizer_transfer.step() + + running_loss += loss.item() + train_bar.set_description("Transfer Learning epoch[{}/{}] loss:{:.3f}".format(epoch + 1, num_transfer_epochs, loss.item())) + + + +transfer_model_save_path = './transfer_model4.pth' +torch.save(transfer_model.state_dict(), transfer_model_save_path) + +print('Finished Transfer Learning') + +transfer_model.eval() + +# Combine the training and test data to use the entire dataset +x_combined = np.concatenate([x_train, x_test], axis=0) +y_combined = np.concatenate([y_train, y_test], axis=0) + +# Convert the combined data to tensors +x_combined_tensor = torch.from_numpy(x_combined).to(torch.float32) +y_combined_tensor = torch.from_numpy(y_combined).to(torch.float32) + +# Create a new DataLoader for the combined data +combined_data = TensorDataset(x_combined_tensor, y_combined_tensor) +combined_loader = torch.utils.data.DataLoader(combined_data, batch_size=config.batch_size, shuffle=False) + +with torch.no_grad(): + y_true = [] # Ground truth labels + y_pred = [] # Predicted labels + for data in test_loader: # Using the test data loader + x_test1, y_test1 = data + predictions = transfer_model(x_test1) + y_true.extend(y_test1.numpy().flatten()) + y_pred.extend(predictions.numpy().flatten()) + +# Convert the lists to numpy arrays for easier calculations +y_true = np.array(y_true) +y_pred = np.array(y_pred) + +# Calculate evaluation 
metrics +mse = mean_squared_error(y_true, y_pred) +rmse = np.sqrt(mse) +mae = mean_absolute_error(y_true, y_pred) + +# Calculate MAPE and SMAPE with checks for division by zero +mape_denominator = np.maximum(np.abs(y_true), 1e-8) # Avoid division by zero +mape = np.mean(np.abs((y_true - y_pred) / mape_denominator)) * 100 + +smape_denominator = np.maximum(np.abs(y_true) + np.abs(y_pred), 1e-8) # Avoid division by zero +smape = 2.0 * np.mean(np.abs(y_pred - y_true) / smape_denominator) * 100 + + +print("Mean Squared Error (MSE):", mse) +print("Root Mean Squared Error (RMSE):", rmse) +print("Mean Absolute Error (MAE):", mae) +print("Mean Absolute Percentage Error (MAPE):", mape) +print("Symmetric Mean Absolute Percentage Error (SMAPE):", smape) + +# Save predictions and metrics to CSV +result_df = pd.DataFrame({'Truth': y_true, 'Predicted': y_pred}) +result_df.to_csv('transfer_learning_predictions4.csv', index=False) + +metrics_df = pd.DataFrame({ + 'MSE': [mse], + 'RMSE': [rmse], + 'MAE': [mae], + 'MAPE': [mape], + 'SMAPE': [smape] +}) +metrics_df.to_csv('transfer_learning_metrics4.csv', index=False) + +print("Transfer learning predictions and metrics saved successfully!") diff --git a/subject2-pre/client.py b/subject2-pre/client.py new file mode 100644 index 0000000000000000000000000000000000000000..37a03f8530e1fea5c5553d90c54e39acf21e3bab --- /dev/null +++ b/subject2-pre/client.py @@ -0,0 +1,185 @@ +import socket +import pickle +import numpy as np +import pandas as pd +import torch.optim as optim + +import torch +import torch.nn as nn +import argparse + +# 定义网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + self.activation = nn.ReLU() # 添加激活函数 + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + output, h_0 = self.gru(x, h_0) + output = self.fc(output[:, -1, :]) + output = self.activation(output) # 添加激活函数 + return output, h_0 + +# 定义学习任务的设置 +def parser_args(): + parser = argparse.ArgumentParser(description='GRU Model Parameters') + parser.add_argument('--timestep', type=int, default=4, help='Time step size') + parser.add_argument('--batch_size', type=int, default=32, help='Batch size') + parser.add_argument('--feature_size', type=int, default=6, help='Number of features') + parser.add_argument('--hidden_size', type=int, default=512, help='Size of the hidden layer') + parser.add_argument('--output_size', type=int, default=3, help='Size of the output layer') + parser.add_argument('--num_layers', type=int, default=2, help='Number of GRU layers') + parser.add_argument('--epochs', type=int, default=3, help='Number of training epochs') + parser.add_argument('--learning_rate', type=float, default=0.0001, help='Learning rate') + parser.add_argument('--model_name', type=str, default='gru', help='Name of the model') + parser.add_argument('--save_path', type=str, default='./{}.pth'.format('gru_a'), help='Path to save the best model') + parser.add_argument('--no_cuda', action='store_true', default=False, help='Disable CUDA') + args = parser.parse_args() + return args + +def train_model(model, x_train, y_train, config): + # 设置优化器和损失函数 + optimizer = optim.Adam(model.parameters(), 
lr=config.learning_rate) + # optimizer = optim.SGD(model.parameters(), lr=config.learning_rate) + criterion = nn.MSELoss() + # 设置初始隐藏状态(可根据需要调整) + # hidden = None + + # 进行本地训练 + for epoch in range(config.epochs): + optimizer.zero_grad() + + # 手动清除隐藏状态 + hidden = None + + # 前向传播 + output, hidden = model(x_train, hidden) + loss = criterion(output, y_train) + + # 反向传播和参数更新 + loss.backward(retain_graph=True) + optimizer.step() + + print(f"Epoch [{epoch + 1}/{config.epochs}], Loss: {loss.item()}") + + # 返回更新后的客户端模型参数 + return model + +# 划分数据 +def split_data(data, timestep,feature_size, target_size): + dataX = [] + dataY = [] + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep, :target_size]) + dataX = np.array(dataX) + dataY = np.array(dataY) + train_size = int(np.round(dataX.shape[0])) + x_train = dataX[:train_size, :] + y_train = dataY[:train_size, :] + x_test = dataX[train_size:, :] + y_test = dataY[train_size:, :] + + return x_train, y_train, x_test, y_test + + +# 数据加载 +def load_data(data_path): + dataset = pd.read_csv(data_path) + return dataset + +args = parser_args() + +data_path ='tvhl_a.csv' +dataset = load_data(data_path) + +selected_features = ['cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan', 'mem_plan'] +target_size = 3 # 三个目标列的大小 +selected_data = dataset[selected_features].values + +x_train,y_train, x_test, y_test = split_data(selected_data, args.timestep, args.feature_size,target_size) +x_train_tensor = torch.Tensor(x_train) +y_train_tensor = torch.Tensor(y_train) + +# 开始训练 +model = GRU(args.feature_size, args.hidden_size, args.num_layers, args.output_size).to('cpu') +# 创建优化器 +optimizer = optim.SGD(model.parameters(), lr=args.learning_rate) + +# 创建TCP客户端套接字 +client = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + +# 连接到服务器 +# server_address = ('127.0.0.1', 4321) +server_address = ('202.117.43.11', 8080) +client.connect(server_address) + +ROUNDS = 20 +length = len(pickle.dumps(model.state_dict())) +chunk_size = 1024 + +for round in range(ROUNDS): + print("第 %d 轮" % (round), end='\n') + + size_message = client.recv(4096) + length = int.from_bytes(size_message, byteorder='big') + + # 从服务器接收模型权重 + received_weights = bytearray() + + while len(received_weights) < length: + # print('从服务器接收模型权重') + chunk = client.recv(4096) + received_weights += chunk + + print('全部接受完毕') + if len(received_weights) == length: + try: + received_weights = pickle.loads(received_weights) + print('解析成功') + # 设置模型权重 + model.load_state_dict(received_weights) + except EOFError: + print('解析失败: EOFError') + else: + print('接收到的数据长度不符合预期,可能存在问题') + + + # 在客户端本地训练模型 + print("开始训练") + train_model(model, x_train_tensor, y_train_tensor, args) + + # 获取更新后的模型权重 + updated_weights = model.state_dict() + print('获取更新后的模型权重') + # 序列化并将更新后的权重发送到服务器 + print('将更新后的权重发送到服务器') + updated_weights_serialized = pickle.dumps(updated_weights) + print(f"Serialized client model weights size: {len(updated_weights_serialized)} bytes") + + # size_message = str(len(updated_weights_serialized)).encode('utf-8') + # client.send(size_message) + size_message = len( updated_weights_serialized).to_bytes(4096, byteorder='big') + client.send(size_message) + + total_sent = 0 + for i in range(0, len(updated_weights_serialized), chunk_size): + chunk =updated_weights_serialized [i:i + chunk_size] + sent = client.send(chunk) + total_sent += sent + + + print('发送完毕') + +# 关闭客户端套接字 +client.close() +torch.save(model, args.save_path) \ No newline at end of file diff --git 
a/subject2-pre/environment.md b/subject2-pre/environment.md new file mode 100644 index 0000000000000000000000000000000000000000..17d56bf6bebbf903bf510951eadf673f4ef0a8ed --- /dev/null +++ b/subject2-pre/environment.md @@ -0,0 +1,478 @@ +astropy 5.1 +asttokens 2.0.5 +astunparse 1.6.3 +atomicwrites 1.4.0 +attrs 22.1.0 +automat 20.2.0 +autopep8 1.6.0 +babel 2.11.0 +backcall 0.2.0 +backports 1.1 +backports.functools_lru_cache 1.6.4 +backports.tempfile 1.0 +backports.weakref 1.0.post1 +bcrypt 3.2.0 +beautifulsoup4 4.11.1 +binaryornot 0.4.4 +black 22.6.0 +blas 1.0 +bleach 4.1.0 +blosc 1.21.3 +bokeh 2.4.3 +boltons 23.0.0 +bottleneck 1.3.5 +brotli 1.0.9 +brotli-bin 1.0.9 +brotlipy 0.7.0 +bs4 0.0.1 +bzip2 1.0.8 +ca-certificates 2023.5.7 +cachetools 5.3.0 +certifi 2023.5.7 +cffi 1.15.1 +cfitsio 3.470 +chardet 4.0.0 +charls 2.2.0 +charset-normalizer 2.0.4 +click 8.0.4 +cloudpickle 2.0.0 +clyent 1.2.1 +colorama 0.4.6 +colorcet 3.0.1 +comm 0.1.2 +conda 23.3.1 +conda-build 3.24.0 +conda-content-trust 0.1.3 +conda-pack 0.6.0 +conda-package-handling 2.0.2 +conda-package-streaming 0.7.0 +conda-repo-cli 1.0.41 +conda-token 0.4.0 +conda-verify 3.4.2 +console_shortcut 0.1.1 +constantly 15.1.0 +contourpy 1.0.5 +cookiecutter 1.7.3 +cryptography 39.0.1 +cssselect 1.1.0 +cuda-cccl 12.1.109 +cuda-cudart 11.7.99 +cuda-cudart-dev 11.7.99 +cuda-cupti 11.7.101 +cuda-libraries 11.7.1 +cuda-libraries-dev 11.7.1 +cuda-nvrtc 11.7.99 +cuda-nvrtc-dev 11.7.99 +cuda-nvtx 11.7.91 +cuda-runtime 11.7.1 +curl 7.87.0 +cycler 0.11.0 +cytoolz 0.12.0 +daal4py 2023.0.2 +dal 2023.0.1 +dask 2022.7.0 +dask-core 2022.7.0 +datashader 0.14.4 +datashape 0.5.4 +debugpy 1.5.1 +decorator 5.1.1 +defusedxml 0.7.1 +diff-match-patch 20200713 +dill 0.3.6 +distributed 2022.7.0 +dm-tree 0.1.8 +docstring-to-markdown 0.11 +docutils 0.18.1 +easydict 1.10 +entrypoints 0.4 +et_xmlfile 1.1.0 +executing 0.8.3 +filelock 3.9.0 +flake8 6.0.0 +flask 2.2.2 +flatbuffers 23.5.9 +flit-core 3.6.0 +fonttools 4.25.0 +freetype 2.12.1 +fsspec 2022.11.0 +fst-pso 1.8.1 +future 0.18.3 +fuzzytm 2.0.5 +gast 0.4.0 +gensim 4.3.0 +giflib 5.2.1 +glib 2.69.1 +glob2 0.7 +google-auth 2.18.1 +google-auth-oauthlib 1.0.0 +google-pasta 0.2.0 +greenlet 2.0.1 +grpcio 1.54.2 +gst-plugins-base 1.18.5 +gstreamer 1.18.5 +h5py 3.7.0 +hdf5 1.10.6 +heapdict 1.0.1 +holoviews 1.15.4 +huggingface_hub 0.10.1 +hvplot 0.8.2 +hyperlink 21.0.0 +icc_rt 2022.1.0 +icu 58.2 +idna 3.4 +imagecodecs 2021.8.26 +imageio 2.26.0 +imagesize 1.4.1 +imbalanced-learn 0.10.1 +importlib-metadata 4.11.3 +importlib_metadata 4.11.3 +incremental 21.3.0 +inflection 0.5.1 +iniconfig 1.1.1 +intake 0.6.7 +intel-openmp 2021.4.0 +intervaltree 3.1.0 +ipykernel 6.19.2 +ipython 8.10.0 +ipython_genutils 0.2.0 +ipywidgets 7.6.5 +isort 5.9.3 +itemadapter 0.3.0 +itemloaders 1.0.4 +itsdangerous 2.0.1 +jax 0.4.10 +jedi 0.18.1 +jellyfish 0.9.0 +jinja2 3.1.2 +jinja2-time 0.2.0 +jmespath 0.10.0 +joblib 1.1.1 +jpeg 9e +jq 1.6 +json5 0.9.6 +jsonpatch 1.32 +jsonpointer 2.1 +jsonschema 4.17.3 +jupyter 1.0.0 +jupyter_client 7.3.4 +jupyter_console 6.6.2 +jupyter_core 5.2.0 +jupyter_server 1.23.4 +jupyterlab 3.5.3 +jupyterlab_pygments 0.1.2 +jupyterlab_server 2.19.0 +jupyterlab_widgets 1.0.0 +jxrlib 1.1 +keras 2.12.0 +keyring 23.4.0 +kiwisolver 1.4.4 +lazy-object-proxy 1.6.0 +lcms2 2.12 +lerc 3.0 +libaec 1.0.4 +libarchive 3.6.2 +libbrotlicommon 1.0.9 +libbrotlidec 1.0.9 +libbrotlienc 1.0.9 +libclang 16.0.0 +libcublas 11.10.3.66 +libcublas-dev 11.10.3.66 +libcufft 10.7.2.124 +libcufft-dev 10.7.2.124 +libcurand 10.3.2.106 +libcurand-dev 10.3.2.106 
+libcurl 7.87.0 +libcusolver 11.4.0.1 +libcusolver-dev 11.4.0.1 +libcusparse 11.7.4.91 +libcusparse-dev 11.7.4.91 +libdeflate 1.17 +libffi 3.4.2 +libiconv 1.16 +liblief 0.12.3 +libnpp 11.7.4.75 +libnpp-dev 11.7.4.75 +libnvjpeg 11.8.0.2 +libnvjpeg-dev 11.8.0.2 +libogg 1.3.5 +libpng 1.6.39 +libsodium 1.0.18 +libspatialindex 1.9.3 +libssh2 1.10.0 +libtiff 4.5.0 +libuv 1.44.2 +libvorbis 1.3.7 +libwebp 1.2.4 +libwebp-base 1.2.4 +libxml2 2.9.14 +libxslt 1.1.35 +libzopfli 1.0.3 +llvmlite 0.39.1 +locket 1.0.0 +lxml 4.9.1 +lz4 3.1.3 +lz4-c 1.9.4 +lzo 2.10 +m2-msys2-runtime 2.5.0.17080.65c939c +m2-patch 2.7.5 +m2w64-libwinpthread-git 5.0.0.4634.697f757 +markdown 3.4.1 +markupsafe 2.1.1 +matplotlib 3.7.0 +matplotlib-base 3.7.0 +matplotlib-inline 0.1.6 +matplotlib-venn 0.11.9 +mccabe 0.7.0 +menuinst 1.4.19 +miniful 0.0.6 +mistune 0.8.4 +mkl 2021.4.0 +mkl-service 2.4.0 +mkl_fft 1.3.1 +mkl_random 1.2.2 +ml-dtypes 0.1.0 +mock 4.0.3 +mpmath 1.2.1 +msgpack-python 1.0.3 +msys2-conda-epoch 20160418 +multipledispatch 0.6.0 +munkres 1.1.4 +mypy_extensions 0.4.3 +navigator-updater 0.3.0 +nbclassic 0.5.2 +nbclient 0.5.13 +nbconvert 6.5.4 +nbformat 5.4.0 +nest-asyncio 1.5.6 +networkx 2.8.4 +ninja 1.10.2 +ninja-base 1.10.2 +nltk 3.7 +notebook 6.5.2 +notebook-shim 0.2.2 +numba 0.56.4 +numexpr 2.8.4 +numpy 1.23.0 +numpydoc 1.5.0 +oauthlib 3.2.2 +openjpeg 2.4.0 +openpyxl 3.0.10 +openssl 1.1.1t +opt-einsum 3.3.0 +packaging 22.0 +pandarallel 1.6.5 +pandas 1.5.3 +pandocfilters 1.5.0 +panel 0.14.3 +param 1.12.3 +paramiko 2.8.1 +parsel 1.6.0 +parso 0.8.3 +partd 1.2.0 +pathlib 1.0.1 +pathspec 0.10.3 +patsy 0.5.3 +pcre 8.45 +pep8 1.7.1 +pexpect 4.8.0 +pickleshare 0.7.5 +pillow 9.4.0 +pip 22.3.1 +pkginfo 1.9.6 +platformdirs 2.5.2 +plotly 5.9.0 +pluggy 1.0.0 +ply 3.11 +pooch 1.4.0 +powershell_shortcut 0.0.1 +poyo 0.5.0 +prometheus_client 0.14.1 +prompt-toolkit 3.0.36 +prompt_toolkit 3.0.36 +protego 0.1.16 +protobuf 4.23.1 +psutil 5.9.0 +ptyprocess 0.7.0 +pure_eval 0.2.2 +py 1.11.0 +py-lief 0.12.3 +pyasn1 0.4.8 +pyasn1-modules 0.2.8 +pycodestyle 2.10.0 +pycosat 0.6.4 +pycparser 2.21 +pyct 0.5.0 +pycurl 7.45.1 +pydispatcher 2.0.5 +pydocstyle 6.3.0 +pyerfa 2.0.0 +pyflakes 3.0.1 +pyfume 0.2.25 +pygments 2.11.2 +pyhamcrest 2.0.2 +pyjwt 2.4.0 +pylint 2.16.2 +pylint-venv 2.3.0 +pyls-spyder 0.4.0 +pynacl 1.5.0 +pyodbc 4.0.34 +pyopenssl 23.0.0 +pyparsing 3.0.9 +pyqt 5.15.7 +pyqt5-sip 12.11.0 +pyqtwebengine 5.15.7 +pyrsistent 0.18.0 +pysocks 1.7.1 +pytables 3.7.0 +pytest 6.2.5 +python 3.10.9 +python-dateutil 2.8.2 +python-fastjsonschema 2.16.2 +python-libarchive-c 2.9 +python-lsp-black 1.2.1 +python-lsp-jsonrpc 1.0.0 +python-lsp-server 1.7.1 +python-slugify 5.0.2 +python-snappy 0.6.1 +pytoolconfig 1.2.5 +pytorch 2.0.1 +pytorch-cuda 11.7 +pytorch-mutex 1.0 +pytz 2022.7 +pyviz_comms 2.0.2 +pywavelets 1.4.1 +pywin32 305 +pywin32-ctypes 0.2.0 +pywinpty 2.0.10 +pyyaml 6.0 +pyzmq 23.2.0 +qdarkstyle 3.0.2 +qstylizer 0.2.2 +qt-main 5.15.2 +qt-webengine 5.15.9 +qtawesome 1.2.2 +qtconsole 5.4.0 +qtpy 2.2.0 +qtwebkit 5.212 +queuelib 1.5.0 +regex 2022.7.9 +requests 2.28.1 +requests-file 1.5.1 +requests-mock 1.11.0 +requests-oauthlib 1.3.1 +requests-toolbelt 0.9.1 +rope 1.7.0 +rsa 4.9 +rtree 1.0.1 +ruamel.yaml 0.17.21 +ruamel.yaml.clib 0.2.6 +ruamel_yaml 0.17.21 +scikit-image 0.19.3 +scikit-learn 1.2.1 +scikit-learn-intelex 2023.0.2 +scipy 1.10.0 +scrapy 2.8.0 +seaborn 0.12.2 +send2trash 1.8.0 +service_identity 18.1.0 +setuptools 68.0.0 +simpful 2.11.0 +simplejson 3.19.1 +sip 6.6.2 +six 1.16.0 +smart_open 5.2.1 +snappy 1.1.9 +sniffio 1.2.0 
+snowballstemmer 2.2.0 +sortedcontainers 2.4.0 +soupsieve 2.3.2.post1 +sphinx 5.0.2 +sphinxcontrib-applehelp 1.0.2 +sphinxcontrib-devhelp 1.0.2 +sphinxcontrib-htmlhelp 2.0.0 +sphinxcontrib-jsmath 1.0.1 +sphinxcontrib-qthelp 1.0.3 +sphinxcontrib-serializinghtml 1.1.5 +spyder 5.4.1 +spyder-kernels 2.4.1 +sqlalchemy 1.4.39 +sqlite 3.40.1 +stack_data 0.2.0 +statsmodels 0.13.5 +supervenn 0.4.1 +sympy 1.11.1 +tabulate 0.8.10 +tbb 2021.7.0 +tbb4py 2021.7.0 +tblib 1.7.0 +tenacity 8.0.1 +tensorboard 2.12.3 +tensorboard-data-server 0.6.1 +tensorboard-plugin-wit 1.8.1 +tensorflow 2.12.0 +tensorflow-estimator 2.11.0 +tensorflow-intel 2.12.0 +tensorflow-io-gcs-filesystem 0.31.0 +tensorflow-probability 0.19.0 +termcolor 2.3.0 +terminado 0.17.1 +text-unidecode 1.3 +textdistance 4.2.1 +threadpoolctl 2.2.0 +three-merge 0.1.1 +tifffile 2021.7.2 +tinycss2 1.2.1 +tk 8.6.12 +tldextract 3.2.0 +tokenizers 0.11.4 +toml 0.10.2 +tomli 2.0.1 +tomlkit 0.11.1 +toolz 0.12.0 +torchaudio 2.0.2 +torchvision 0.15.2 +tornado 6.1 +tqdm 4.64.1 +traitlets 5.7.1 +transformers 4.24.0 +tushare 1.2.89 +twisted 22.2.0 +twisted-iocpsupport 1.0.2 +typeguard 2.13.3 +typing-extensions 4.4.0 +typing_extensions 4.4.0 +tzdata 2023.3 +ujson 5.4.0 +unidecode 1.2.0 +upsetplot 0.8.0 +urllib3 1.26.14 +utils 1.0.1 +vc 14.2 +vs2015_runtime 14.27.29016 +w3lib 1.21.0 +watchdog 2.1.6 +wcwidth 0.2.5 +webencodings 0.5.1 +websocket-client 0.57.0 +werkzeug 2.2.2 +whatthepatch 1.0.2 +wheel 0.40.0 +widgetsnbextension 3.5.2 +win_inet_pton 1.1.0 +wincertstore 0.2 +winpty 0.4.3 +wrapt 1.14.1 +xarray 2022.11.0 +xlwings 0.29.1 +xz 5.2.10 +yaml 0.2.5 +yapf 0.31.0 +zeromq 4.3.4 +zfp 0.5.5 +zict 2.1.0 +zipp 3.11.0 +zlib 1.2.13 +zope 1.0 +zope.interface 5.4.0 +zstandard 0.19.0 +zstd 1.5.2 diff --git a/subject2-pre/environment.yaml b/subject2-pre/environment.yaml new file mode 100644 index 0000000000000000000000000000000000000000..17d56bf6bebbf903bf510951eadf673f4ef0a8ed --- /dev/null +++ b/subject2-pre/environment.yaml @@ -0,0 +1,478 @@ +astropy 5.1 +asttokens 2.0.5 +astunparse 1.6.3 +atomicwrites 1.4.0 +attrs 22.1.0 +automat 20.2.0 +autopep8 1.6.0 +babel 2.11.0 +backcall 0.2.0 +backports 1.1 +backports.functools_lru_cache 1.6.4 +backports.tempfile 1.0 +backports.weakref 1.0.post1 +bcrypt 3.2.0 +beautifulsoup4 4.11.1 +binaryornot 0.4.4 +black 22.6.0 +blas 1.0 +bleach 4.1.0 +blosc 1.21.3 +bokeh 2.4.3 +boltons 23.0.0 +bottleneck 1.3.5 +brotli 1.0.9 +brotli-bin 1.0.9 +brotlipy 0.7.0 +bs4 0.0.1 +bzip2 1.0.8 +ca-certificates 2023.5.7 +cachetools 5.3.0 +certifi 2023.5.7 +cffi 1.15.1 +cfitsio 3.470 +chardet 4.0.0 +charls 2.2.0 +charset-normalizer 2.0.4 +click 8.0.4 +cloudpickle 2.0.0 +clyent 1.2.1 +colorama 0.4.6 +colorcet 3.0.1 +comm 0.1.2 +conda 23.3.1 +conda-build 3.24.0 +conda-content-trust 0.1.3 +conda-pack 0.6.0 +conda-package-handling 2.0.2 +conda-package-streaming 0.7.0 +conda-repo-cli 1.0.41 +conda-token 0.4.0 +conda-verify 3.4.2 +console_shortcut 0.1.1 +constantly 15.1.0 +contourpy 1.0.5 +cookiecutter 1.7.3 +cryptography 39.0.1 +cssselect 1.1.0 +cuda-cccl 12.1.109 +cuda-cudart 11.7.99 +cuda-cudart-dev 11.7.99 +cuda-cupti 11.7.101 +cuda-libraries 11.7.1 +cuda-libraries-dev 11.7.1 +cuda-nvrtc 11.7.99 +cuda-nvrtc-dev 11.7.99 +cuda-nvtx 11.7.91 +cuda-runtime 11.7.1 +curl 7.87.0 +cycler 0.11.0 +cytoolz 0.12.0 +daal4py 2023.0.2 +dal 2023.0.1 +dask 2022.7.0 +dask-core 2022.7.0 +datashader 0.14.4 +datashape 0.5.4 +debugpy 1.5.1 +decorator 5.1.1 +defusedxml 0.7.1 +diff-match-patch 20200713 +dill 0.3.6 +distributed 2022.7.0 +dm-tree 0.1.8 +docstring-to-markdown 0.11 
+docutils 0.18.1 +easydict 1.10 +entrypoints 0.4 +et_xmlfile 1.1.0 +executing 0.8.3 +filelock 3.9.0 +flake8 6.0.0 +flask 2.2.2 +flatbuffers 23.5.9 +flit-core 3.6.0 +fonttools 4.25.0 +freetype 2.12.1 +fsspec 2022.11.0 +fst-pso 1.8.1 +future 0.18.3 +fuzzytm 2.0.5 +gast 0.4.0 +gensim 4.3.0 +giflib 5.2.1 +glib 2.69.1 +glob2 0.7 +google-auth 2.18.1 +google-auth-oauthlib 1.0.0 +google-pasta 0.2.0 +greenlet 2.0.1 +grpcio 1.54.2 +gst-plugins-base 1.18.5 +gstreamer 1.18.5 +h5py 3.7.0 +hdf5 1.10.6 +heapdict 1.0.1 +holoviews 1.15.4 +huggingface_hub 0.10.1 +hvplot 0.8.2 +hyperlink 21.0.0 +icc_rt 2022.1.0 +icu 58.2 +idna 3.4 +imagecodecs 2021.8.26 +imageio 2.26.0 +imagesize 1.4.1 +imbalanced-learn 0.10.1 +importlib-metadata 4.11.3 +importlib_metadata 4.11.3 +incremental 21.3.0 +inflection 0.5.1 +iniconfig 1.1.1 +intake 0.6.7 +intel-openmp 2021.4.0 +intervaltree 3.1.0 +ipykernel 6.19.2 +ipython 8.10.0 +ipython_genutils 0.2.0 +ipywidgets 7.6.5 +isort 5.9.3 +itemadapter 0.3.0 +itemloaders 1.0.4 +itsdangerous 2.0.1 +jax 0.4.10 +jedi 0.18.1 +jellyfish 0.9.0 +jinja2 3.1.2 +jinja2-time 0.2.0 +jmespath 0.10.0 +joblib 1.1.1 +jpeg 9e +jq 1.6 +json5 0.9.6 +jsonpatch 1.32 +jsonpointer 2.1 +jsonschema 4.17.3 +jupyter 1.0.0 +jupyter_client 7.3.4 +jupyter_console 6.6.2 +jupyter_core 5.2.0 +jupyter_server 1.23.4 +jupyterlab 3.5.3 +jupyterlab_pygments 0.1.2 +jupyterlab_server 2.19.0 +jupyterlab_widgets 1.0.0 +jxrlib 1.1 +keras 2.12.0 +keyring 23.4.0 +kiwisolver 1.4.4 +lazy-object-proxy 1.6.0 +lcms2 2.12 +lerc 3.0 +libaec 1.0.4 +libarchive 3.6.2 +libbrotlicommon 1.0.9 +libbrotlidec 1.0.9 +libbrotlienc 1.0.9 +libclang 16.0.0 +libcublas 11.10.3.66 +libcublas-dev 11.10.3.66 +libcufft 10.7.2.124 +libcufft-dev 10.7.2.124 +libcurand 10.3.2.106 +libcurand-dev 10.3.2.106 +libcurl 7.87.0 +libcusolver 11.4.0.1 +libcusolver-dev 11.4.0.1 +libcusparse 11.7.4.91 +libcusparse-dev 11.7.4.91 +libdeflate 1.17 +libffi 3.4.2 +libiconv 1.16 +liblief 0.12.3 +libnpp 11.7.4.75 +libnpp-dev 11.7.4.75 +libnvjpeg 11.8.0.2 +libnvjpeg-dev 11.8.0.2 +libogg 1.3.5 +libpng 1.6.39 +libsodium 1.0.18 +libspatialindex 1.9.3 +libssh2 1.10.0 +libtiff 4.5.0 +libuv 1.44.2 +libvorbis 1.3.7 +libwebp 1.2.4 +libwebp-base 1.2.4 +libxml2 2.9.14 +libxslt 1.1.35 +libzopfli 1.0.3 +llvmlite 0.39.1 +locket 1.0.0 +lxml 4.9.1 +lz4 3.1.3 +lz4-c 1.9.4 +lzo 2.10 +m2-msys2-runtime 2.5.0.17080.65c939c +m2-patch 2.7.5 +m2w64-libwinpthread-git 5.0.0.4634.697f757 +markdown 3.4.1 +markupsafe 2.1.1 +matplotlib 3.7.0 +matplotlib-base 3.7.0 +matplotlib-inline 0.1.6 +matplotlib-venn 0.11.9 +mccabe 0.7.0 +menuinst 1.4.19 +miniful 0.0.6 +mistune 0.8.4 +mkl 2021.4.0 +mkl-service 2.4.0 +mkl_fft 1.3.1 +mkl_random 1.2.2 +ml-dtypes 0.1.0 +mock 4.0.3 +mpmath 1.2.1 +msgpack-python 1.0.3 +msys2-conda-epoch 20160418 +multipledispatch 0.6.0 +munkres 1.1.4 +mypy_extensions 0.4.3 +navigator-updater 0.3.0 +nbclassic 0.5.2 +nbclient 0.5.13 +nbconvert 6.5.4 +nbformat 5.4.0 +nest-asyncio 1.5.6 +networkx 2.8.4 +ninja 1.10.2 +ninja-base 1.10.2 +nltk 3.7 +notebook 6.5.2 +notebook-shim 0.2.2 +numba 0.56.4 +numexpr 2.8.4 +numpy 1.23.0 +numpydoc 1.5.0 +oauthlib 3.2.2 +openjpeg 2.4.0 +openpyxl 3.0.10 +openssl 1.1.1t +opt-einsum 3.3.0 +packaging 22.0 +pandarallel 1.6.5 +pandas 1.5.3 +pandocfilters 1.5.0 +panel 0.14.3 +param 1.12.3 +paramiko 2.8.1 +parsel 1.6.0 +parso 0.8.3 +partd 1.2.0 +pathlib 1.0.1 +pathspec 0.10.3 +patsy 0.5.3 +pcre 8.45 +pep8 1.7.1 +pexpect 4.8.0 +pickleshare 0.7.5 +pillow 9.4.0 +pip 22.3.1 +pkginfo 1.9.6 +platformdirs 2.5.2 +plotly 5.9.0 +pluggy 1.0.0 +ply 3.11 +pooch 1.4.0 
+powershell_shortcut 0.0.1 +poyo 0.5.0 +prometheus_client 0.14.1 +prompt-toolkit 3.0.36 +prompt_toolkit 3.0.36 +protego 0.1.16 +protobuf 4.23.1 +psutil 5.9.0 +ptyprocess 0.7.0 +pure_eval 0.2.2 +py 1.11.0 +py-lief 0.12.3 +pyasn1 0.4.8 +pyasn1-modules 0.2.8 +pycodestyle 2.10.0 +pycosat 0.6.4 +pycparser 2.21 +pyct 0.5.0 +pycurl 7.45.1 +pydispatcher 2.0.5 +pydocstyle 6.3.0 +pyerfa 2.0.0 +pyflakes 3.0.1 +pyfume 0.2.25 +pygments 2.11.2 +pyhamcrest 2.0.2 +pyjwt 2.4.0 +pylint 2.16.2 +pylint-venv 2.3.0 +pyls-spyder 0.4.0 +pynacl 1.5.0 +pyodbc 4.0.34 +pyopenssl 23.0.0 +pyparsing 3.0.9 +pyqt 5.15.7 +pyqt5-sip 12.11.0 +pyqtwebengine 5.15.7 +pyrsistent 0.18.0 +pysocks 1.7.1 +pytables 3.7.0 +pytest 6.2.5 +python 3.10.9 +python-dateutil 2.8.2 +python-fastjsonschema 2.16.2 +python-libarchive-c 2.9 +python-lsp-black 1.2.1 +python-lsp-jsonrpc 1.0.0 +python-lsp-server 1.7.1 +python-slugify 5.0.2 +python-snappy 0.6.1 +pytoolconfig 1.2.5 +pytorch 2.0.1 +pytorch-cuda 11.7 +pytorch-mutex 1.0 +pytz 2022.7 +pyviz_comms 2.0.2 +pywavelets 1.4.1 +pywin32 305 +pywin32-ctypes 0.2.0 +pywinpty 2.0.10 +pyyaml 6.0 +pyzmq 23.2.0 +qdarkstyle 3.0.2 +qstylizer 0.2.2 +qt-main 5.15.2 +qt-webengine 5.15.9 +qtawesome 1.2.2 +qtconsole 5.4.0 +qtpy 2.2.0 +qtwebkit 5.212 +queuelib 1.5.0 +regex 2022.7.9 +requests 2.28.1 +requests-file 1.5.1 +requests-mock 1.11.0 +requests-oauthlib 1.3.1 +requests-toolbelt 0.9.1 +rope 1.7.0 +rsa 4.9 +rtree 1.0.1 +ruamel.yaml 0.17.21 +ruamel.yaml.clib 0.2.6 +ruamel_yaml 0.17.21 +scikit-image 0.19.3 +scikit-learn 1.2.1 +scikit-learn-intelex 2023.0.2 +scipy 1.10.0 +scrapy 2.8.0 +seaborn 0.12.2 +send2trash 1.8.0 +service_identity 18.1.0 +setuptools 68.0.0 +simpful 2.11.0 +simplejson 3.19.1 +sip 6.6.2 +six 1.16.0 +smart_open 5.2.1 +snappy 1.1.9 +sniffio 1.2.0 +snowballstemmer 2.2.0 +sortedcontainers 2.4.0 +soupsieve 2.3.2.post1 +sphinx 5.0.2 +sphinxcontrib-applehelp 1.0.2 +sphinxcontrib-devhelp 1.0.2 +sphinxcontrib-htmlhelp 2.0.0 +sphinxcontrib-jsmath 1.0.1 +sphinxcontrib-qthelp 1.0.3 +sphinxcontrib-serializinghtml 1.1.5 +spyder 5.4.1 +spyder-kernels 2.4.1 +sqlalchemy 1.4.39 +sqlite 3.40.1 +stack_data 0.2.0 +statsmodels 0.13.5 +supervenn 0.4.1 +sympy 1.11.1 +tabulate 0.8.10 +tbb 2021.7.0 +tbb4py 2021.7.0 +tblib 1.7.0 +tenacity 8.0.1 +tensorboard 2.12.3 +tensorboard-data-server 0.6.1 +tensorboard-plugin-wit 1.8.1 +tensorflow 2.12.0 +tensorflow-estimator 2.11.0 +tensorflow-intel 2.12.0 +tensorflow-io-gcs-filesystem 0.31.0 +tensorflow-probability 0.19.0 +termcolor 2.3.0 +terminado 0.17.1 +text-unidecode 1.3 +textdistance 4.2.1 +threadpoolctl 2.2.0 +three-merge 0.1.1 +tifffile 2021.7.2 +tinycss2 1.2.1 +tk 8.6.12 +tldextract 3.2.0 +tokenizers 0.11.4 +toml 0.10.2 +tomli 2.0.1 +tomlkit 0.11.1 +toolz 0.12.0 +torchaudio 2.0.2 +torchvision 0.15.2 +tornado 6.1 +tqdm 4.64.1 +traitlets 5.7.1 +transformers 4.24.0 +tushare 1.2.89 +twisted 22.2.0 +twisted-iocpsupport 1.0.2 +typeguard 2.13.3 +typing-extensions 4.4.0 +typing_extensions 4.4.0 +tzdata 2023.3 +ujson 5.4.0 +unidecode 1.2.0 +upsetplot 0.8.0 +urllib3 1.26.14 +utils 1.0.1 +vc 14.2 +vs2015_runtime 14.27.29016 +w3lib 1.21.0 +watchdog 2.1.6 +wcwidth 0.2.5 +webencodings 0.5.1 +websocket-client 0.57.0 +werkzeug 2.2.2 +whatthepatch 1.0.2 +wheel 0.40.0 +widgetsnbextension 3.5.2 +win_inet_pton 1.1.0 +wincertstore 0.2 +winpty 0.4.3 +wrapt 1.14.1 +xarray 2022.11.0 +xlwings 0.29.1 +xz 5.2.10 +yaml 0.2.5 +yapf 0.31.0 +zeromq 4.3.4 +zfp 0.5.5 +zict 2.1.0 +zipp 3.11.0 +zlib 1.2.13 +zope 1.0 +zope.interface 5.4.0 +zstandard 0.19.0 +zstd 1.5.2 diff --git a/subject2-pre/post_fl.py 
b/subject2-pre/post_fl.py new file mode 100644 index 0000000000000000000000000000000000000000..fab6ad925b4fd88fa1bc8a8f939c7d17de80e8f8 --- /dev/null +++ b/subject2-pre/post_fl.py @@ -0,0 +1,230 @@ +import numpy as np +import pandas as pd +import torch +import torch.nn as nn +from sklearn.preprocessing import StandardScaler, MinMaxScaler +import argparse + + +# 数据加载 +def load_data(data_path): + dataset = pd.read_csv(data_path) + return dataset +# 加载数据 +data_path = 'processed_alibaba.csv' +data = load_data(data_path) +# 选择需要训练和测试的特征列和目标列 +data = data[['cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan', 'mem_plan']] + + +# 标准化 +scaler = MinMaxScaler(feature_range=(0, 1)) +data = scaler.fit_transform(data.values) + +u = 0.1 +# 定义迁移学习的训练轮数 +num_transfer_epochs = 100 + +# 定义学习任务的设置 +def parser_args(): + parser = argparse.ArgumentParser(description='GRU Model Parameters') + parser.add_argument('--timestep', type=int, default=4, help='Time step size') + parser.add_argument('--batch_size', type=int, default=32, help='Batch size') + parser.add_argument('--feature_size', type=int, default=6, help='Number of features') + parser.add_argument('--hidden_size', type=int, default=512, help='Size of the hidden layer') + parser.add_argument('--output_size', type=int, default=3, help='Size of the output layer') + parser.add_argument('--num_layers', type=int, default=2, help='Number of GRU layers') + parser.add_argument('--epochs', type=int, default=10, help='Number of training epochs') + parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate') + parser.add_argument('--model_name', type=str, default='gru', help='Name of the model') + parser.add_argument('--save_path', type=str, default='./{}.pth'.format('gru'), help='Path to save the best model') + parser.add_argument('--no_cuda', action='store_true', default=False, help='Disable CUDA') + args = parser.parse_args() + return args +args = parser_args() + + +#新模型 +class Config(): + + timestep = 4 # 时间步长,就是利用多少时间窗口 + batch_size = 32 # 批次大小 + feature_size = 6 # 每个步长对应的特征数量,这里使用8维特征 + hidden_size = 512 # 隐层大小 + output_size = 3 # 由于是单输出任务 + num_layers = 4 # gru的层数 + epochs = 32 # 迭代轮数 + best_loss = float('inf') # 记录损失,初始化为正无穷大 + learning_rate = 0.001 # 学习率 + model_name = 'gru' # 模型名称 + save_path = './{}.pth'.format(model_name) # 最优模型保存路径 + +config = Config() + +# 划分数据 +def split_data(data, timestep,target_size): + dataX = [] + dataY = [] + for index in range(len(data) - timestep): + dataX.append(data[index: index + timestep]) + dataY.append(data[index + timestep, :target_size]) + dataX = np.array(dataX) + dataY = np.array(dataY) + train_size = int(np.round(0.7*dataX.shape[0])) + x_train = dataX[:train_size, :] + y_train = dataY[:train_size, :] + x_test = dataX[train_size:, :] + y_test = dataY[train_size:, :] + + return x_train, y_train, x_test, y_test + +# 定义网络 +class GRU(nn.Module): + def __init__(self, feature_size, hidden_size, num_layers, output_size): + super(GRU, self).__init__() + self.hidden_size = hidden_size + self.num_layers = num_layers + self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True) + self.fc = nn.Linear(hidden_size, output_size) + self.activation = nn.ReLU() # 添加激活函数 + + def forward(self, x, hidden=None): + batch_size = x.shape[0] + if hidden is None: + h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float() + else: + h_0 = hidden + output, h_0 = self.gru(x, h_0) + output = self.fc(output[:, -1, :]) + output = self.activation(output) # 添加激活函数 + return output, h_0 
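# Note (editorial): the post-training objective used further below combines the supervised
# regression loss with a distillation term against the frozen public model:
#     loss = (1 - u) * MSE(transfer_model(x), y) + u * MSE(transfer_model(x), pre_trained_model(x))
# with u = 0.1 in this script. The helper below is only an illustrative restatement of that
# objective (the name kd_loss is hypothetical, not part of the original code); the training
# loop computes the same expression inline.
def kd_loss(transfer_pred, target, pretrained_pred, u, criterion=nn.MSELoss()):
    # Weighted sum of the ground-truth MSE and the teacher-matching (distillation) MSE.
    return (1 - u) * criterion(transfer_pred, target) + u * criterion(transfer_pred, pretrained_pred)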
+ + + + +# 获取训练数据 +x_train, y_train, x_test, y_test = split_data(data, config.timestep, config.feature_size) +target_size = 3 # 三个目标列的大小 +x_train, y_train, x_test, y_test = split_data(data, args.timestep,target_size) + +#形成训练数据集 +x_train_tensor = torch.Tensor(x_train) +y_train_tensor = torch.Tensor(y_train) +x_test_tensor = torch.Tensor(x_test) +y_test_tensor = torch.Tensor(y_test) + +# 假设您的模型类是GRU,config中包含相关的模型配置信息 +pre_trained_model = GRU(args.feature_size, args.hidden_size, args.num_layers, args.output_size) +# 加载预训练模型的参数 +# 加载预训练模型的参数 +model_path = "gru_a.pth" +pre_trained_model = torch.load(model_path, map_location='cpu') + +transfer_model = GRU(config.feature_size, config.hidden_size, config.num_layers,config.output_size) # 定义GRU网络 + +loss_function = nn.MSELoss() # 定义损失函数 +optimizer_transfer = torch.optim.AdamW(transfer_model.parameters(), lr= config.learning_rate) # 定义优化器 + + +# 将公共模型的参数设为不可训练 +for param in pre_trained_model.parameters(): + param.requires_grad = False +print(y_train_tensor) + +# train_dataset = TensorDataset(x_train_tensor, y_train_tensor) +# # 创建 DataLoader +# train_loader = DataLoader(train_dataset, batch_size=config.batch_size, shuffle=True) + +for epoch in range(num_transfer_epochs): + + transfer_model.train() + running_loss = 0 + + optimizer_transfer.zero_grad() + # 手动清除隐藏状态 + hidden = None + + # 从数据中获取输入和目标张量 + inputs = x_train_tensor + targets = y_train_tensor + + y_train_pred,hidden = transfer_model(inputs) + y_pretrained_pred,hidden = pre_trained_model(inputs) + + loss = (1 - u) * loss_function(y_train_pred, targets) + u * loss_function(y_train_pred, y_pretrained_pred) + # 反向传播和参数更新 + loss.backward() + optimizer_transfer.step() + + running_loss += loss.item() + + # 将运行中的损失打印出来,以便监控训练过程 + print(f"Epoch {epoch+1}/{num_transfer_epochs}, Loss: {running_loss}") + + +# 保存迁移模型 +transfer_model_save_path = './ta_model.pth' # 定义迁移模型保存路径 +torch.save(transfer_model.state_dict(), transfer_model_save_path) + +print('Finished Transfer Learning') + +# 测试循环 +transfer_model.eval() # 将模型设置为评估模式 + +# with torch.no_grad(): # 在测试期间禁用梯度计算 +# test_inputs = x_test_tensor +# test_targets = y_test_tensor +# +# test_predictions, _ = transfer_model(test_inputs) +# test_loss = loss_function(test_predictions, test_targets) +# + + +with torch.no_grad(): # 在测试期间禁用梯度计算 + test_inputs = x_test_tensor + test_targets = y_test_tensor + + test_predictions, _ = transfer_model(test_inputs) + test_loss = loss_function(test_predictions, test_targets) + + + # 创建一个文件保存输入数据 + with open('input_data.txt', 'w') as input_file: + # 打印每次的数据输入和预测结果 + for i in range(len(test_inputs)): + input_data = test_inputs[i].numpy() + target_data = test_targets[i].numpy() + predicted_data = test_predictions[i].numpy() + + print(f"样本 {i + 1}:") + print("输入数据:") + for row in input_data: + print(" ".join(map(str, row))) + + print("目标数据:", target_data) + print("预测数据:", predicted_data) + print("----------") + + # 将输入数据写入文件 + input_file.write(f"样本 {i + 1}:\n") + input_file.write("输入数据:\n") + for row in input_data: + input_file.write(" ".join(map(str, row)) + "\n") + + input_file.write("目标数据:" + repr(target_data.tolist()) + "\n") + input_file.write("预测数据:" + repr(predicted_data.tolist()) + "\n") + input_file.write("----------\n") + +# print(f"测试损失: {test_loss.item()}") + +# # 将测试结果保存到CSV文件 +# result_df = pd.DataFrame({ +# 'Actual_cpu': test_targets[:, 0].numpy(), +# 'Actual_gpu': test_targets[:, 1].numpy(), +# 'Actual_mem': test_targets[:, 2].numpy(), +# 'Predicted_cpu': test_predictions[:, 0].numpy(), +# 'Predicted_gpu': 
diff --git a/subject2-pre/server.py b/subject2-pre/server.py
new file mode 100644
index 0000000000000000000000000000000000000000..bcb74aa79ffde7d9ec63a46c3819f47046702d48
--- /dev/null
+++ b/subject2-pre/server.py
@@ -0,0 +1,164 @@
+import socket
+import pickle
+import torch.nn as nn
+import argparse
+from concurrent.futures import ThreadPoolExecutor
+
+
+
+# Define the neural network
+class GRU(nn.Module):
+    def __init__(self, feature_size, hidden_size, num_layers, output_size):
+        super(GRU, self).__init__()
+        self.hidden_size = hidden_size
+        self.num_layers = num_layers
+        self.gru = nn.GRU(feature_size, hidden_size, num_layers, batch_first=True)
+        self.fc = nn.Linear(hidden_size, output_size)
+        self.activation = nn.ReLU()
+
+    def forward(self, x, hidden=None):
+        batch_size = x.shape[0]
+        if hidden is None:
+            h_0 = x.data.new(self.num_layers, batch_size, self.hidden_size).fill_(0).float()
+        else:
+            h_0 = hidden
+        output, h_0 = self.gru(x, h_0)
+        output = self.fc(output[:, -1, :])
+        output = self.activation(output)
+        return output, h_0
+
+# Command-line settings of the learning task
+def parser_args():
+    parser = argparse.ArgumentParser(description='GRU Model Parameters')
+    parser.add_argument('--timestep', type=int, default=4, help='Time step size')
+    parser.add_argument('--batch_size', type=int, default=32, help='Batch size')
+    parser.add_argument('--feature_size', type=int, default=6, help='Number of features')
+    parser.add_argument('--hidden_size', type=int, default=512, help='Size of the hidden layer')
+    parser.add_argument('--output_size', type=int, default=3, help='Size of the output layer')
+    parser.add_argument('--num_layers', type=int, default=2, help='Number of GRU layers')
+    parser.add_argument('--epochs', type=int, default=3, help='Number of training epochs')
+    parser.add_argument('--learning_rate', type=float, default=0.0001, help='Learning rate')
+    parser.add_argument('--model_name', type=str, default='gru', help='Name of the model')
+    parser.add_argument('--save_path', type=str, default='./{}.pth'.format('gru'), help='Path to save the best model')
+    parser.add_argument('--no_cuda', action='store_true', default=False, help='Disable CUDA')
+    args = parser.parse_args()
+    return args
+
+args = parser_args()
+model = GRU(args.feature_size, args.hidden_size, args.num_layers, args.output_size)
+
+# Create the server socket
+server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+# Bind the server address and port
+# server_address = ('127.0.0.1', 4321)
+server_address = ('202.117.43.11', 8080)
+server.bind(server_address)
+# Start listening (backlog of up to 5 pending connections)
+server.listen(5)
+clients = []
+c_weights = {}  # weights received from each client
+c_length = {}   # payload length announced by each client
+
+
+# Wait until two clients (cloud providers) have connected
+while True:
+    print("Waiting for client connections...")
+    client, addr = server.accept()
+    print(f"Accepted connection from {addr}")
+    clients.append(client)
+    if len(clients) == 2:
+        break
+
+ROUNDS = 20
+length = len(pickle.dumps(model.state_dict()))
+chunk_size = 1024
+
+
+for round in range(ROUNDS):
+
+    print("ROUND %d" % (round), end='\n')
+    # Get the current model weights
+    model_weights = model.state_dict()
+    model_weights_serialized = pickle.dumps(model_weights)
+    print(f"Serialized model weights size: {len(model_weights_serialized)} bytes")
+
+    # Tell every client how many bytes of serialized weights to expect
+    for client in clients:
+        print('Sending serialized weight size to client:')
+        print(client)
+        size_message = len(model_weights_serialized).to_bytes(4096, byteorder='big')  # fixed-size big-endian header; must match the client side
+        client.send(size_message)
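+    # Note (descriptive, added for readability): this prototype frames every transfer the same
+    # way. A 4096-byte big-endian length header announces the payload size, and the pickled
+    # state_dict is then streamed in chunk_size-byte pieces. The clients are assumed to use the
+    # identical framing when they return their locally trained weights below.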
+    # Send the current model weights to all clients
+    print('Sending the current model weights to the clients')
+    for client in clients:
+        total_sent = 0
+        for i in range(0, len(model_weights_serialized), chunk_size):
+            chunk = model_weights_serialized[i:i + chunk_size]
+            sent = client.send(chunk)
+            total_sent += sent
+
+
+
+    def receive_data_from_client(client, c_length):
+
+        print(f'Receiving payload length from client: {client}')
+        size_message = client.recv(4096)
+        length = int.from_bytes(size_message, byteorder='big')
+        c_length[client] = length
+
+        print(f'Receiving weights from client: {client}')
+        received_weights = b""
+        while len(received_weights) < c_length[client]:
+            chunk = client.recv(4096)
+            received_weights += chunk
+
+        if len(received_weights) == length:
+            try:
+                received_weights = pickle.loads(received_weights)
+                print('Deserialization succeeded')
+                c_weights[client] = received_weights
+            except EOFError:
+                print('Deserialization failed: EOFError')
+                # Keep the weights from the previous round (assumes this client already sent a valid update)
+                c_weights[client] = c_weights[client]
+        else:
+            print('Received payload length does not match the announced size; the update may be corrupted')
+
+
+
+    # Receive the client updates concurrently with a ThreadPoolExecutor
+    with ThreadPoolExecutor(max_workers=len(clients)) as executor:
+        # Submit one receiving task per client
+        futures = [executor.submit(receive_data_from_client, client, c_length) for client in clients]
+
+        # Wait for all tasks to finish
+        for future in futures:
+            future.result()
+
+
+    # Compute the averaged weights (FedAvg aggregation)
+    aggregated_weights = model.state_dict()
+    # print(aggregated_weights)
+
+
+    # Zero out all values
+    for key in aggregated_weights.keys():
+        aggregated_weights[key].zero_()
+
+    # Accumulate the client weights
+    for weights in c_weights.values():
+        for key, value in weights.items():
+            if key not in aggregated_weights:
+                aggregated_weights[key] = value.clone()
+            else:
+                aggregated_weights[key] += value
+
+    # Average over the number of clients
+    for key in aggregated_weights.keys():
+        aggregated_weights[key] /= len(c_weights)
+
+    # Load the aggregated weights back into the model
+    model.load_state_dict(aggregated_weights)
+    print("Weight aggregation finished")
+
+# Close the server socket
+server.close()
+
diff --git a/subject2-pre/tvhl.py b/subject2-pre/tvhl.py
new file mode 100644
index 0000000000000000000000000000000000000000..421a73cc1b97c9b065546ddf8f00cb0206b93294
--- /dev/null
+++ b/subject2-pre/tvhl.py
@@ -0,0 +1,200 @@
+import numpy as np
+import pandas as pd
+from tensorflow.keras.models import Sequential
+from sklearn.preprocessing import MinMaxScaler
+from tensorflow.keras.layers import Dense, LeakyReLU
+import tensorflow as tf
+from scipy.spatial.distance import cdist
+import requests
+import csv
+from io import StringIO
+import datetime
+
+def get_data_from_api(api_url):
+    """
+    Fetch data from the API.
+    Parameters:
+    - api_url: address of the API
+    Returns:
+    - the JSON payload of the response on success
+    - None on failure
+    """
+    try:
+        # Issue a GET request
+        response = requests.get(api_url)
+
+        # Check the response status code
+        if response.status_code == 200:
+            # Parse the JSON data and return it
+            return response.json()
+        else:
+            # Print the error and return None
+            print(f"Error: {response.status_code} - {response.text}")
+            return None
+    except requests.RequestException as e:
+        # Print the exception and return None
+        print(f"Request Error: {e}")
+        return None
+
+def json_data_to_csv(json_data):
+    try:
+        # Extract the header (CSV column names)
+        header = list(json_data['pageData'][0].keys())
+        # Create an in-memory file object for the CSV data
+        csv_output = StringIO()
+        # Create the CSV writer
+        csv_writer = csv.DictWriter(csv_output, fieldnames=header)
+        # Write the header
+        csv_writer.writeheader()
+        # Write the rows
+        csv_writer.writerows(json_data['pageData'])
+        # Get the CSV data as a string
+        csv_data = csv_output.getvalue()
+        print("Conversion succeeded: JSON data to CSV string")
+        return csv_data
+    except Exception as e:
+        print(f"Conversion failed: {e}")
+        return None
+
+# Build the generator
+def build_generator():
+    model = Sequential()
+    model.add(Dense(64, input_dim=7))
+    model.add(LeakyReLU(alpha=0.2))
+    model.add(Dense(32))
+    model.add(LeakyReLU(alpha=0.2))
+    model.add(Dense(7, activation='sigmoid'))
+    return model
+generator = build_generator()
+
+# Build the discriminator
+def build_discriminator():
+    model = Sequential()
+    model.add(Dense(32, input_dim=7))
+    model.add(LeakyReLU(alpha=0.2))
+    model.add(Dense(16))
+    model.add(LeakyReLU(alpha=0.2))
+    model.add(Dense(1, activation='sigmoid'))
+    return model
+discriminator = build_discriminator()
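+
+# Note (descriptive, added for readability): the loss term defined next appears to correspond to
+# the temporal-aware feature calibration described in the README. The first column of each sample
+# is its normalized time; the remaining six columns are resource features. The (batch x batch)
+# matrix of Euclidean distances between real and generated feature vectors (scaled by 10, as in
+# the original code) is weighted column-wise by |t_real[j] - t_fake[j]| and then averaged, so the
+# feature discrepancy counts for more when a generated sample drifts in time from its paired real
+# sample. The resulting scalar is added to the reported discriminator loss during training.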
+
+
+# Joint-distribution loss with temporal weighting
+def compute_joint_distribution_loss(real_samples, fake_samples):
+    # The first column is the (normalized) relative time; the remaining columns are resource features
+    real_relative_time = real_samples[:, 0]
+    fake_relative_time = fake_samples[:, 0]
+    real_samples = real_samples[:, 1:]
+    fake_samples = fake_samples[:, 1:]
+    distance = 10 * cdist(real_samples, fake_samples, metric='euclidean')
+    conditional_distance = np.abs(real_relative_time - fake_relative_time)
+    joint_distribution_loss = np.mean(distance * conditional_distance)
+    return joint_distribution_loss
+
+
+# Train the GAN
+def train_gan(noise_data, real_data, epochs, batch_size):
+    for epoch in range(epochs):
+        noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)]
+        real_samples = real_data[np.random.randint(0, real_data.shape[0], size=batch_size)]
+        generated_samples = generator.predict(noise)
+        # print("Real Samples Shape:", real_samples.shape)
+        # print("Generated Samples Shape:", generated_samples.shape)
+        X = np.concatenate((real_samples, generated_samples))
+        y = np.ones(2 * batch_size)
+        y[batch_size:] = 0
+        # Update the discriminator on real and generated samples
+        # (train_on_batch returns [loss, accuracy] because the model is compiled with metrics=['accuracy'])
+        discriminator.trainable = True
+        discriminator_loss, discriminator_acc = discriminator.train_on_batch(X, y)
+        # Update the generator through the stacked GAN model (discriminator frozen)
+        noise = noise_data[np.random.randint(0, noise_data.shape[0], size=batch_size)]
+        y_gen = np.ones(batch_size)
+        discriminator.trainable = False
+        generator_loss = gan.train_on_batch(noise, y_gen)
+        # Compute the joint-distribution (temporal) term
+        joint_distribution_loss = compute_joint_distribution_loss(real_samples, generated_samples)
+        # Add the temporal term to the reported discriminator loss
+        # (monitoring only; it is not backpropagated through the discriminator)
+        discriminator_loss = discriminator_loss + 1 * joint_distribution_loss
+        # Print the losses periodically
+        if epoch % 100 == 0:
+            print(
+                f'Epoch: {epoch}/{epochs}, Discriminator Loss: {discriminator_loss}, Generator Loss: {generator_loss}')
+
+
+def gene_data(generated_data):
+    # Map the normalized time back to a 24-hour clock
+    generated_data['time'] = (generated_data['time'] * 24).astype(int)
+
+    data_by_hour = {}
+    # Group generated_data by hour of day
+    for i in range(24):
+        data_by_hour[i] = generated_data[generated_data['time'] == i]
+    empty_df = pd.DataFrame()
+
+    # Make sure every hour has data: if an hour is empty, borrow a row from the next hour
+    # ((i + 1) % 24 wraps hour 23 around to hour 0)
+    for i in range(0, 24):
+        if data_by_hour[i].empty:
+            next_i = (i + 1) % 24
+            data_by_hour[i] = data_by_hour[next_i].head(1)
+        data_length = len(data_by_hour[i])
+        # If an hour has fewer than 31 rows, pad it with rows borrowed from hour 5
+        if data_length < 31:
+            data_by_hour[i] = pd.concat([data_by_hour[i], data_by_hour[5].head(30)])
+    for i in range(0, 24):
+        empty_df = pd.concat([empty_df, data_by_hour[i].head(30)])
+
+    # Start the time series with the first row of empty_df, then interleave rows so that
+    # consecutive records cycle through the 24 hours
+    time_series_data = empty_df.iloc[:1].copy()
+    for i in range(1, 60):
+        for j in range(0, 24):
+            time_series_data = pd.concat([time_series_data, empty_df.iloc[j * 24 + i:j * 24 + i + 1].copy()])
+    time_series_data = time_series_data.copy()
+    time_series_data['time'] = range(len(time_series_data))
+    return time_series_data
+
+# Noise: a uniform column for the time feature plus Gaussian columns for the other six features
+def set_noise():
+    uniform_noise = np.random.uniform(0, 1, size=(10000, 1))
+    normal_noise = np.random.normal(0, 1, size=(10000, 6))
+    noise = np.hstack((uniform_noise, normal_noise))
+    return noise
+
+
+# Fetch historical data from the API (disabled; read from a local CSV instead)
+# api_url = "http://202.117.43.38:28081/jcce/v1.0/job/list"
+# data = get_data_from_api(api_url)
+data = pd.read_csv('pre_microsoft.csv')
+
+# Convert the extracted JSON data to CSV (only needed for the API path)
+# data = json_data_to_csv(data)
+# data = pd.read_csv(StringIO(data), parse_dates=["startTime", "endTime", "arriveTime"])
+current_time = datetime.datetime.now()
+# print(current_time)
+# data["startTime"] = np.round((data["startTime"] - current_time) / np.timedelta64(1, "h") + 100)
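+
+# Note (descriptive, added for readability): the remainder of the script follows the usual
+# two-model GAN setup. The discriminator is compiled as a standalone model first, then frozen
+# (trainable = False) before being wired behind the generator, so gan.train_on_batch() in
+# train_gan() only updates the generator while discriminator.train_on_batch() updates the
+# discriminator.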
+# Extract the features
+features = ['time', 'cpu_use', 'gpu_use', 'mem_use', 'cpu_plan', 'gpu_plan', 'mem_plan']
+data = data[features]
+# Merge records that share the same timestamp
+# data = data.groupby("startTime").sum().reset_index()
+data['time'] = data['time'] % 24
+# Adjust this preprocessing when switching to the API data source
+
+
+
+# Normalize to [0, 1]
+data = data[features].values
+scaler = MinMaxScaler(feature_range=(0, 1))
+data_scaled = scaler.fit_transform(data)
+
+# Assemble the GAN and set the training parameters
+discriminator.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+discriminator.trainable = False
+gan_input = tf.keras.Input(shape=(7,))
+gan_output = discriminator(generator(gan_input))
+gan = tf.keras.Model(gan_input, gan_output)
+gan.compile(loss='binary_crossentropy', optimizer='adam')
+
+noise = set_noise()
+# Train the GAN. train_gan expects (noise_data, real_data), so pass the noise first and the
+# min-max scaled real data second.
+train_gan(noise, data_scaled, epochs=1000, batch_size=128)
+# Generate and post-process the virtual workload data
+generated_data = pd.DataFrame(generator.predict(noise), columns=features)
+generated_data = gene_data(generated_data)
+
+# Save the generated data
+generated_data.to_csv('tvhl.csv', index=False)
\ No newline at end of file