Add a linear-chain CRF layer on top of a bi-LSTM

Hi there! I'm building a bi-LSTM with an attention layer for a biotechnology project involving vaccine discovery. The aim is to predict membrane protein topology and identify the protein segments that stay outside the cell. On top of this network I would like to add a CRF layer, because I want to eliminate impossible transitions such as inside→outside and outside→inside (without passing through the membrane).
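What I have in mind for the CRF part is roughly the sketch below. It is untested and leans on the third-party pytorch-crf package (pip install pytorch-crf), so torchcrf.CRF, its transitions parameter and its decode method come from that library, and the emissions here are just random tensors to show the shapes I expect:

# Sketch of the CRF part (not wired into my model yet). Assumes pytorch-crf.
import torch
from torchcrf import CRF

num_tags = 3                          # O (outside) = 0, I (inside) = 1, M (membrane) = 2
crf = CRF(num_tags, batch_first=True)

# Forbid the impossible direct transitions in<->out by giving them a very low
# score. crf.transitions[i, j] is the (learnable) score of going from tag i to
# tag j, so this may need to be re-applied after each optimizer step.
with torch.no_grad():
    crf.transitions[1, 0] = -10000.0  # I -> O
    crf.transitions[0, 1] = -10000.0  # O -> I

# The emissions would come from the BiLSTM(+attention): (batch, seq_len, num_tags)
batch, seq_len = 4, 50
emissions = torch.randn(batch, seq_len, num_tags)
tags = torch.randint(0, num_tags, (batch, seq_len))
mask = torch.ones(batch, seq_len, dtype=torch.bool)        # real residues vs. padding

loss = -crf(emissions, tags, mask=mask, reduction='mean')  # training loss
best_paths = crf.decode(emissions, mask=mask)              # list of best tag sequences

Here is my current code without the CRF: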

import torch 
import pandas as pd 
import torch.nn as nn 
from torch.utils.data import random_split, DataLoader, TensorDataset 
import torch.nn.functional as F 
import numpy as np 
import torch.optim as optim 
from torch.optim import Adam 
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
df = pd.read_csv('line.csv')
amino_dict = {'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19}
sequences = df["sequence"]
encoded_sequences = [to_categorical([amino_dict[amino] for amino in seq], num_classes=20) for seq in sequences]

topo_dict = {'O': 0, 'I': 1, 'M': 2}
topos = df['label']
encoded_topos = [to_categorical([topo_dict[label] for label in seq], num_classes=3) for seq in topos]

encoded_sequences = pad_sequences(encoded_sequences, maxlen=1000, padding='post', dtype='float32')
encoded_topos = pad_sequences(encoded_topos, maxlen=1000, padding='post', dtype='float32')

input = encoded_sequences
output = encoded_topos

# Convert Input and Output data to Tensors and create a TensorDataset 
X = torch.Tensor(input)       # float32 one-hot sequences, shape (num_proteins, 1000, 20)
print('\nInput format: ', X.shape, X.dtype)
y = torch.tensor(output)      # float32 one-hot labels, shape (num_proteins, 1000, 3)
print('Output format: ', y.shape, y.dtype)
data = TensorDataset(X, y)    # Create a torch.utils.data.TensorDataset object for further data manipulation

sequence_length = 1000
input_size = 20
no_classes = 3
hidden_size = 20 #tunable
num_epochs = 1000 #tunable
batch_size = 16 #tunable
lr = 0.002 #tunable

bidirectional = True 
#lstm_units = 50
num_layers = 2  #tunable
#dropout= 0.2

class LSTM(nn.Module):
  def __init__(self, input_size, hidden_size, no_classes, num_layers):
    super(LSTM, self).__init__()

    self.hidden_size = hidden_size
    self.num_layers = num_layers

    # batch_first=True because the input is shaped (batch, seq_len, 20)
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers,
                        batch_first=True, bidirectional=True)

    # one attention score per residue, computed from the 2*hidden_size LSTM output
    self.attention = nn.Linear(hidden_size*2, 1)

    self.fc1 = nn.Linear(hidden_size*2, no_classes)
    #self.dropout = nn.Dropout(dropout)
    #self.crf = CRF(no_classes, batch_first=True)

  def forward(self, x):
    # out: (batch, seq_len, 2*hidden_size)
    out, _ = self.lstm(x)

    # attention weights normalized over the sequence dimension: (batch, seq_len, 1)
    attention_weights = torch.softmax(self.attention(out), dim=1)
    # keep per-residue features (the labels are per residue), just re-weighted
    attention_applied = out * attention_weights

    # per-residue class scores: (batch, seq_len, no_classes)
    out = self.fc1(attention_applied)
    #out = self.crf(out, tags)

    return out



# Create the model
model = LSTM(input_size, hidden_size, no_classes, num_layers)

# Define the loss function and the optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

# Move the model to the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Iterate over mini-batches of whole sequences, not single samples
loader = DataLoader(data, batch_size=batch_size, shuffle=True)

# Train the model
for epoch in range(num_epochs):
    for i, (sequences, labels) in enumerate(loader):
        sequences = sequences.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass: outputs has shape (batch, seq_len, no_classes)
        outputs = model(sequences)
        # CrossEntropyLoss expects (batch, classes, seq_len) scores and integer
        # class targets, so permute the scores and argmax the one-hot labels
        # (padded positions end up counted as class 0 here)
        loss = criterion(outputs.permute(0, 2, 1), labels.argmax(dim=-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

    #if (i + 1) % 100 == 0:
    print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(loader)}], Loss: {loss.item():.4f}')

Can you help me?

There is a PyTorch tutorial for that: Advanced: Making Dynamic Decisions and the Bi-LSTM CRF (https://pytorch.org/tutorials/beginner/nlp/advanced_tutorial.html).


Thank you!

I adapted the tutorial to my use case, but I have an issue with the batch_size parameter: I would like to compute the loss not on every single sequence but on batches of N sequences. When I create the dataset with TensorDataset and load it with DataLoader(batch_size=64), I don't get batches of 64 sequences but batches of 64 characters, I think because sentence_in and targets are 1-D per-residue tensors, so TensorDataset treats every residue as a separate sample.
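What I would like to end up with is something like this sketch, where one dataset item is one whole protein and a collate_fn pads the batch. The ProteinDataset class, the collate function and the padding values are only my assumptions, and the tutorial model below still processes one sequence at a time, so it would not consume such batches as-is:

# Sketch: one Dataset item = one whole protein, so DataLoader(batch_size=64)
# yields 64 sequences per batch. ProteinDataset and the pad values are placeholders.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader

class ProteinDataset(Dataset):
    def __init__(self, pairs, word_to_ix, tag_to_ix):
        self.pairs = pairs                # list of (sequence_str, label_str)
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

    def __len__(self):
        return len(self.pairs)

    def __getitem__(self, idx):
        seq, tags = self.pairs[idx]
        x = torch.tensor([self.word_to_ix[c] for c in seq], dtype=torch.long)
        y = torch.tensor([self.tag_to_ix[t] for t in tags], dtype=torch.long)
        return x, y

def collate(batch):
    xs, ys = zip(*batch)
    lengths = torch.tensor([len(x) for x in xs])
    xs = pad_sequence(xs, batch_first=True, padding_value=0)    # pad index (placeholder)
    ys = pad_sequence(ys, batch_first=True, padding_value=-1)   # -1 = "ignore" tag (placeholder)
    return xs, ys, lengths

# loader = DataLoader(ProteinDataset(training_data, word_to_ix, tag_to_ix),
#                     batch_size=64, shuffle=True, collate_fn=collate)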

Here is my code:

import os
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import random_split, DataLoader, TensorDataset 

def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the
                # scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

df = pd.read_csv('output_file.csv')

training_data = [(row.sequence, row.label) for row in df.itertuples()]

word_to_ix = {}
for sentence, tags in training_data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
sentence_in = prepare_sequence(sentence, word_to_ix)

tags = [t.strip() for t in tags]

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 64
HIDDEN_DIM = 20
num_epochs = 50

tag_to_ix = {"O": 0, "I": 1, "M": 2, START_TAG: 3, STOP_TAG: 4}

targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)
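# NOTE: sentence_in and targets here are 1-D per-residue tensors built from a
# single sequence (the last one from the loop above), so the DataLoader below
# slices them into chunks of 64 residues rather than batches of 64 sequences.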

dataset = TensorDataset(sentence_in, targets)
dataloader = DataLoader(dataset, batch_size=64)
for batch in dataloader:
    sentence_in, targets = batch

START_TAG = "<START>"
STOP_TAG = "<STOP>"
EMBEDDING_DIM = 64
HIDDEN_DIM = 20
num_epochs = 50
batch_size = 64 #64 characters, not sequences

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
#model = model.to(device)
optimizer = optim.SGD(model.parameters(), lr=0.001, weight_decay=1e-4)


for epoch in range(num_epochs):
    for i, (sentence, tags) in enumerate(training_data):
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()

         # Step 2. Get our inputs ready for the network, that is,
        # turn them into Tensors of word indices.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        dataset = TensorDataset(sentence_in, targets)
        dataloader = DataLoader(dataset, batch_size=64)   #batch size

        for batch in dataloader:
          sentence_in, targets = batch

        # Step 3. Run our forward pass.
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. Compute the loss, gradients, and update the parameters by
        # calling optimizer.step()
        loss.backward()
        optimizer.step()


    # Save a checkpoint and report at the end of each epoch
    with torch.no_grad():
        #precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
        #print(model(precheck_sent))

        # https://discuss.pytorch.org/t/how-to-save-a-model-from-a-previous-epoch/20252/6
        SAVE_DIR = '/content/'
        path = os.path.join(SAVE_DIR, 'model.pth')
        torch.save(model.state_dict(), path)  # save the model weights
        print(f'Epoch [{epoch + 1}/{num_epochs}], Step [{i + 1}/{len(training_data)}], Loss: {loss.item():.4f}')

# We got it!


When I use:

for batch in dataloader:
    sentence_in, targets = batch
    print("sentence_in: ", sentence_in)
    print("targets: ", targets)

I get:

sentence_in:  tensor([ 0, 18, 13, 14,  0, 19,  1,  4, 14,  7,  6, 14, 10, 12, 17, 18,  5,  4,
        10,  3,  3,  3,  9, 14,  3, 10,  0,  1,  7, 11, 15, 15,  3,  6,  9,  7,
         4, 14, 14, 15,  8, 17,  4,  9, 11,  5, 15,  7,  7,  0,  4,  7, 14, 10,
        18, 11,  5,  5,  4,  5,  4,  2, 12, 19])
targets:  tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2])
sentence_in:  tensor([ 1, 15,  7, 11,  1,  5, 11, 11,  5,  8, 14, 10,  7, 16,  5,  5,  4,  5,
        14,  3, 14,  3, 10,  1,  2, 13,  7,  4, 14,  7, 10,  1,  5,  1,  7, 13,
        11, 13,  5,  1, 13,  8,  4,  3,  7, 13, 14,  9,  0,  0,  7,  2,  7,  7,
         5, 13,  5, 12,  5, 18,  4,  5, 14,  4])
targets:  tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
sentence_in:  tensor([11,  0, 15,  6,  3, 17,  0, 11,  7, 14,  9,  8, 11,  9,  8,  9,  9,  5,
         3, 11,  3, 11,  7,  8,  5,  6, 18, 15,  7,  7,  5,  5, 11, 11,  1,  5,
         5, 14,  8,  1,  5, 13, 10,  9, 15,  7, 17, 17,  7, 14,  1, 11,  7, 12,
         8,  7, 19, 14,  5,  8,  5,  5,  5,  1])
targets:  tensor([2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
sentence_in:  tensor([10,  5, 11, 18,  5,  7,  0, 10,  7,  6, 13,  6,  9, 18, 15, 11,  7, 15,
        10,  5,  5,  2, 10,  7,  8, 10,  1,  7, 15, 11,  5, 11, 15,  4,  3,  2,
        14,  9,  8,  1,  7, 12,  7,  7,  7,  0, 13,  5,  0, 13,  7, 16,  2,  1,
         5, 13, 14, 13, 10,  2, 15,  8,  3,  7])
targets:  tensor([2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1])
sentence_in:  tensor([ 5,  3,  6, 17,  1, 14, 18])
targets:  tensor([1, 1, 1, 1, 1, 1, 1])

input file (example):

MSNSKFNVRLLTEIAFMAALAFIISLIPNTVYGWIIVEIACIPILLLSLRRGLTAGLVGGLIWGILSMITGHAYILSLSQAFLEYLVAPVSLGIAGLFRQKTAPLKLAPVLLGTFVAVLLKYFFHFIAGIIFWSQYAWKGWGAVAYSLAVNGISGILTAIAAFVILIIFVKKFPKLFIHSNY	IIIIIIIIIIIMMMMMMMMMMMMMMMMMMMMOOOOMMMMMMMMMMMMMMIIIIMMMMMMMMMMMMMMMMMMMMOOOOOOMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIII
MTNNQKVKTLTYSAFMTAFIIILGFLPGIPIGFIPVPIILQNMGIMMAGGLLGPKYGTISVGAFLALALIGLPVLTGGNGGAASFLGPSGGYRIAWLFTPFLIGFFLKKLKITTSQNWFGELIIVLLFGVIFVDFVGAIWLSFQSNIPLLTSLISNLVFIPGDCIKAILTVVIVRRLRKQGGFELYFRK	IIIIIIIIIIIIIMMMMMMMMMMMMMMMMOOOOOOOOMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMOOOOOOOOOOOOOOOOOMMMMMMMMMMMMMMMMIIIIIIIIIIIIIIIIIMMMMMMMMMMMMMMMMMMOOOOOOOOOOOOMMMMMMMMMMMMMMMMMMMMMMIIIIIIIIIIIIIII
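Just to make the format explicit: each line holds the amino acid sequence followed by a topology string of the same length, one label (O, I or M) per residue. For example, the first positions of the first line pair up like this:

# first positions of the first example line: residue -> topology label
seq  = "MSNSK"   # amino acids
topo = "IIIII"   # matching labels: these residues are on the inside (I)
assert len(seq) == len(topo)

tag_to_ix = {"O": 0, "I": 1, "M": 2}
print([tag_to_ix[t] for t in topo])   # [1, 1, 1, 1, 1]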

Thank you for the help!