pypots tutorial
python
pypots
time series
pypots는 시계열 작업(결측치 대체, 예측, 분류, 군집)을 쉽게 수행할 수 있도록 만든 패키지이다.
pypots 설치
# Option 1: install with pip
pip install pypots # first-time installation
pip install pypots --upgrade # upgrade an existing installation to the latest version
# Option 2: install with conda (conda-forge channel)
conda install -c conda-forge pypots # first-time installation
conda update -c conda-forge pypots # upgrade an existing installation to the latest version
preparing data
from pypots.data.generating import gene_physionet2012
# Generate the PhysioNet-2012 dataset with artificially_missing_rate=0.1.
physionet2012_dataset = gene_physionet2012(artificially_missing_rate=0.1)

# Training set: only the (partially observed) feature array is supplied.
dataset_for_training = {
    "X": physionet2012_dataset['train_X'],
}

# Validation set: besides "X" it also carries the intact data and the
# indicating mask -- presumably so the imputation can be scored against the
# artificially removed values during validation (confirm in the PyPOTS docs).
dataset_for_validating = {
    "X": physionet2012_dataset['val_X'],
    "X_intact": physionet2012_dataset['val_X_intact'],
    "indicating_mask": physionet2012_dataset['val_X_indicating_mask'],
}

# Test set: only the feature array to be imputed.
dataset_for_testing = {
    "X": physionet2012_dataset['test_X'],
}
결측치 처리(impute)
from pypots.optim import Adam
from pypots.imputation import SAITS
# Initialize the SAITS imputation model.
saits = SAITS(
    n_steps=physionet2012_dataset['n_steps'],
    n_features=physionet2012_dataset['n_features'],
    n_layers=2,
    d_model=256,
    d_inner=128,
    n_heads=4,
    d_k=64,
    d_v=64,
    dropout=0.1,
    attn_dropout=0.1,
    # Otherwise the original self-attention mechanism will be applied.
    diagonal_attention_mask=True,
    # You can adjust the values of ORT_weight and MIT_weight to make the SAITS
    # model focus more on one task. Usually you can just leave them at the
    # default value, i.e. 1.
    ORT_weight=1,
    MIT_weight=1,
    batch_size=32,
    # Here we set epochs=10 for a quick demo; you can set it to 100 or more
    # for better performance.
    epochs=10,
    # Here we set patience=3 to stop training early if the evaluating loss
    # doesn't decrease for 3 epochs. You can leave it at the default, None,
    # to disable early stopping.
    patience=3,
    # Give the optimizer. Unlike torch.optim.Optimizer, you don't have to
    # specify the model's parameters when initializing pypots.optim.Optimizer.
    # You can also leave it at the default, which initializes an Adam
    # optimizer with lr=0.001.
    optimizer=Adam(lr=1e-3),
    # num_workers is passed to torch.utils.data.DataLoader: the number of
    # subprocesses used for data loading. The default 0 means data loading
    # happens in the main process, i.e. there are no subprocesses. Increase it
    # to >1 if data loading is a bottleneck for your training speed.
    num_workers=0,
    # If left at its default, PyPOTS will automatically assign the best device
    # for you. Set it to 'cpu' if you don't have CUDA devices. You can also
    # set it to 'cuda:0' or 'cuda:1' if you have multiple CUDA devices.
    device='cpu',
    # Path for saving tensorboard files and trained model files.
    saving_path="tutorial_results/imputation/saits",
    # Only save the best model after training has finished. You can also set
    # it to "better" to save every model that improves during training.
    model_saving_strategy="best",
)
# Train the model on the training set, and validate it on the validation set
# to select the best model for testing in the next step.
saits.fit(train_set=dataset_for_training, val_set=dataset_for_validating)

# Testing stage: impute the originally-missing values and the
# artificially-missing values in the test set.
saits_imputation = saits.impute(dataset_for_testing)
from pypots.utils.metrics import cal_mae
# Calculate the mean absolute error against the ground truth, restricted by
# the indicating mask to the artificially-missing values.
testing_mae = cal_mae(
    saits_imputation,
    physionet2012_dataset['test_X_intact'],
    physionet2012_dataset['test_X_indicating_mask'],
)
print("Testing mean absolute error: %.4f" % testing_mae)