DataBlock API to construct the DataLoaders

We will create a DataBlock to process our UCR datasets

ucr_path = untar_data(URLs.UCR)
df_train, df_test = load_df_ucr(ucr_path, 'StarLightCurves')
Loading files from: /home/tcapelle/.fastai/data/Univariate2018_arff/StarLightCurves
df_train.head()
att1 att2 att3 att4 att5 att6 att7 att8 att9 att10 ... att1016 att1017 att1018 att1019 att1020 att1021 att1022 att1023 att1024 target
0 0.537303 0.531103 0.528503 0.529403 0.533603 0.540903 0.551103 0.564003 0.579603 0.597603 ... 0.546903 0.545903 0.543903 0.541003 0.537203 0.532303 0.526403 0.519503 0.511403 b'3'
1 0.588398 0.593898 0.599098 0.604098 0.608798 0.613397 0.617797 0.622097 0.626097 0.630097 ... 0.237399 0.246499 0.256199 0.266499 0.277399 0.288799 0.300899 0.313599 0.326899 b'3'
2 -0.049900 -0.041500 -0.033400 -0.025600 -0.018100 -0.010800 -0.003800 0.003000 0.009600 0.015900 ... -0.173801 -0.161601 -0.149201 -0.136401 -0.123201 -0.109701 -0.095901 -0.081701 -0.067100 b'1'
3 1.337005 1.319805 1.302905 1.286305 1.270005 1.254005 1.238304 1.223005 1.208104 1.193504 ... 1.288905 1.298505 1.307705 1.316505 1.324905 1.332805 1.340205 1.347005 1.353205 b'3'
4 0.769801 0.775301 0.780401 0.785101 0.789401 0.793301 0.796801 0.799901 0.802601 0.805101 ... 0.742401 0.744501 0.747301 0.750701 0.754801 0.759501 0.765001 0.771301 0.778401 b'3'

5 rows × 1025 columns

x_cols = df_train.columns[slice(0,-1)].to_list()
x_cols[0:5]
['att1', 'att2', 'att3', 'att4', 'att5']

class TabularTS[source]

TabularTS(df, procs=None, x_names=None, y_names=None, block_y=None, splits=None, do_setup=True, device=None, inplace=False) :: CollBase

A DataFrame wrapper that knows which cols are x/y, and returns rows in __getitem__

tts = TabularTS(df_train, x_names=x_cols, y_names='target')
tts.iloc[0:4]
       att1      att2      att3      att4      att5      att6      att7  \
0  0.537303  0.531103  0.528503  0.529403  0.533603  0.540903  0.551103   
1  0.588398  0.593898  0.599098  0.604098  0.608798  0.613397  0.617797   
2 -0.049900 -0.041500 -0.033400 -0.025600 -0.018100 -0.010800 -0.003800   
3  1.337005  1.319805  1.302905  1.286305  1.270005  1.254005  1.238304   

       att8      att9     att10  ...   att1016   att1017   att1018   att1019  \
0  0.564003  0.579603  0.597603  ...  0.546903  0.545903  0.543903  0.541003   
1  0.622097  0.626097  0.630097  ...  0.237399  0.246499  0.256199  0.266499   
2  0.003000  0.009600  0.015900  ... -0.173801 -0.161601 -0.149201 -0.136401   
3  1.223005  1.208104  1.193504  ...  1.288905  1.298505  1.307705  1.316505   

    att1020   att1021   att1022   att1023   att1024  target  
0  0.537203  0.532303  0.526403  0.519503  0.511403    b'3'  
1  0.277399  0.288799  0.300899  0.313599  0.326899    b'3'  
2 -0.123201 -0.109701 -0.095901 -0.081701 -0.067100    b'1'  
3  1.324905  1.332805  1.340205  1.347005  1.353205    b'3'  

[4 rows x 1025 columns]

class TSPandas[source]

TSPandas(df, procs=None, x_names=None, y_names=None, block_y=None, splits=None, do_setup=True, device=None, inplace=False) :: TabularTS

A DataFrame wrapper that knows which cols are x/y, and returns rows in __getitem__

to = TSPandas(df_train, x_names=x_cols, y_names='target')
to.iloc[0:5]
       att1      att2      att3      att4      att5      att6      att7  \
0  0.537303  0.531103  0.528503  0.529403  0.533603  0.540903  0.551103   
1  0.588398  0.593898  0.599098  0.604098  0.608798  0.613397  0.617797   
2 -0.049900 -0.041500 -0.033400 -0.025600 -0.018100 -0.010800 -0.003800   
3  1.337005  1.319805  1.302905  1.286305  1.270005  1.254005  1.238304   
4  0.769801  0.775301  0.780401  0.785101  0.789401  0.793301  0.796801   

       att8      att9     att10  ...   att1016   att1017   att1018   att1019  \
0  0.564003  0.579603  0.597603  ...  0.546903  0.545903  0.543903  0.541003   
1  0.622097  0.626097  0.630097  ...  0.237399  0.246499  0.256199  0.266499   
2  0.003000  0.009600  0.015900  ... -0.173801 -0.161601 -0.149201 -0.136401   
3  1.223005  1.208104  1.193504  ...  1.288905  1.298505  1.307705  1.316505   
4  0.799901  0.802601  0.805101  ...  0.742401  0.744501  0.747301  0.750701   

    att1020   att1021   att1022   att1023   att1024  target  
0  0.537203  0.532303  0.526403  0.519503  0.511403    b'3'  
1  0.277399  0.288799  0.300899  0.313599  0.326899    b'3'  
2 -0.123201 -0.109701 -0.095901 -0.081701 -0.067100    b'1'  
3  1.324905  1.332805  1.340205  1.347005  1.353205    b'3'  
4  0.754801  0.759501  0.765001  0.771301  0.778401    b'3'  

[5 rows x 1025 columns]

setups[source]

setups(to:TabularTS)

encodes[source]

encodes(to:TabularTS)

decodes[source]

decodes(to:TabularTS)

class NormalizeTS[source]

NormalizeTS(enc=None, dec=None, split_idx=None, order=None) :: TabularProc

Normalize the x variables.

setups[source]

setups(to:TabularTS)

encodes[source]

encodes(to:TabularTS)

decodes[source]

decodes(to:TabularTS)

norm = Normalize()
df = df_train.loc[:, [x_cols[0]]]
to = TSPandas(df, norm, x_names=x_cols[0])
x = df.values.squeeze()
m,s = x.mean(),x.std()
test_eq(norm.means[x_cols[0]], m)
test_close(norm.stds[x_cols[0]], s)
test_close(to[x_cols[0]].values, (x-m)/s)

class ReadTSBatch[source]

ReadTSBatch(to) :: ItemTransform

A transform that always takes tuples as items

to = TSPandas(df_train, None, x_names=x_cols, y_names='target')
to.procs
Pipeline: Categorize -- {'vocab': None, 'sort': True, 'add_na': False}

Let's check that we get the encoded batch

rtsb = ReadTSBatch(to)
x,y = rtsb.encodes(to.iloc[0:16])
x.shape, y.shape
(torch.Size([16, 1, 1024]), torch.Size([16, 1]))

TODO: this batch-reading function still needs to be reworked.

class TabularTSDataloader[source]

TabularTSDataloader(dataset, bs=16, shuffle=False, after_batch=None, num_workers=0, verbose=False, do_setup=True, pin_memory=False, timeout=0, batch_size=None, drop_last=False, indexed=None, n=None, device=None, persistent_workers=False, wif=None, before_iter=None, after_item=None, before_batch=None, after_iter=None, create_batches=None, create_item=None, create_batch=None, retain=None, get_idxs=None, sample=None, shuffle_fn=None, do_batch=None) :: TfmdDL

Transformed DataLoader

splits = RandomSplitter()(range_of(df_train))
to = TSPandas(df_test, norm, x_names=x_cols, y_names='target', splits=splits)
test_dl = TabularTSDataloader(to)

Quick function to test the performance of the DL

def cycle_dl(dl):
    """Iterate once through every `(x, y)` batch of `dl`, discarding results.

    Used only to measure raw DataLoader throughput (e.g. with `%time`);
    returns None.
    """
    # The `for` statement calls iter() itself, so wrapping `dl` in iter() is
    # redundant. Unpacking keeps the original check that each batch is a pair.
    for _x, _y in dl:
        pass
%time cycle_dl(test_dl)
CPU times: user 80.2 ms, sys: 11 µs, total: 80.2 ms
Wall time: 80 ms

stack_train_valid[source]

stack_train_valid(df_train, df_valid)

Stack df_train and df_valid, adding a valid_col column that is True for rows from df_valid and False for rows from df_train

Integration Example

from timeseries_fastai.models import create_inception
df_main = stack_train_valid(df_train, df_test).iloc[0:128]
splits=[list(range(96)), list(range(96, 128))]
to = TSPandas(df_main, norm, x_names=x_cols, y_names='target', splits=splits)
dls = to.dataloaders(32, 128)
inception = create_inception(1, len(dls.vocab))
learn = Learner(dls, inception, metrics=[accuracy])
learn.fit_one_cycle(1)
epoch train_loss valid_loss accuracy time
0 0.943485 1.096740 0.625000 00:00