Hello,
I am trying to implement a GCN with PyTorch Geometric and I get a dimension mismatch when I run the code. I take a batch of 32 texts; for each text I build its adjacency matrix and edge indices, then I combine all 32 graphs into one big batch. But the GCN throws an error. Can someone please look at the code below and tell me where I am going wrong? Thanks.
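My understanding (please correct me if this is wrong) is that Batch.from_data_list expects each Data object to carry a node feature matrix x of shape [num_nodes, num_features], and that it concatenates the graphs along the node dimension while shifting the edge indices of each graph. A toy example of what I mean (the two graphs are made up):

import torch
from torch_geometric.data import Data, Batch

# Two toy graphs: 3 nodes and 2 nodes, each node with 4 features.
g1 = Data(x=torch.randn(3, 4), edge_index=torch.tensor([[0, 1, 2], [1, 2, 0]]))
g2 = Data(x=torch.randn(2, 4), edge_index=torch.tensor([[0, 1], [1, 0]]))

batch = Batch.from_data_list([g1, g2])
print(batch.x.shape)           # torch.Size([5, 4]) -- nodes are concatenated
print(batch.edge_index.max())  # tensor(4) -- indices are offset per graph

The full script is below.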
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Load train data
train_data = pd.read_csv('https://raw.githubusercontent.com/salarMokhtariL/Facke-News-Detection/main/Dataset/train.csv')
# Load test data
test_data = pd.read_csv('https://raw.githubusercontent.com/salarMokhtariL/Facke-News-Detection/main/Dataset/test.csv')
train_data.dropna(inplace=True)
# %% prepare the data
class FakeNewsDataset2(Dataset):
    '''This class takes in the data, tokenizes it using the DistilBertTokenizer
    from the transformers library, and returns the raw text, input IDs,
    attention mask, and label.'''

    def __init__(self, data, max_len=128):
        self.data = data
        self.max_len = max_len
        self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        text = self.data.iloc[index]['text']
        label = self.data.iloc[index]['label']
        inputs = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return text, inputs['input_ids'].squeeze(0), inputs['attention_mask'].squeeze(0), torch.tensor(label, dtype=torch.long)
# %% split the data into training and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
# %%
# Create PyTorch data loaders for the training, validation, and test sets:
train_dataset = FakeNewsDataset2(train_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_dataset = FakeNewsDataset2(val_data)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_dataset = FakeNewsDataset2(test_data)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
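As a sanity check on shapes, this is what I expect one item from the dataset to look like (the label value depends on the row):

sample_text, sample_ids, sample_mask, sample_label = train_dataset[0]
print(sample_ids.shape)    # torch.Size([128])
print(sample_mask.shape)   # torch.Size([128])
print(sample_label.shape)  # torch.Size([]) -- a scalar class id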
# %% train gcn
# from torch_geometric.utils import to_dense_adj
from torch_geometric.data import Data, Batch
def create_word_cooccurrence_adjacency(sentences):
    # Placeholder connectivity: connects tokens that share the same first
    # character (not true co-occurrence counts yet).
    words = sentences
    words = sorted(words)  # sorted tokens, duplicates kept (not unique words)
    word_to_idx = {word: i for i, word in enumerate(words)}  # currently unused
    n_words = len(words)
    # Initialize adjacency matrix with zeros
    adj_matrix = torch.zeros((n_words, n_words), dtype=torch.float)
    # Establish connections
    for i, word1 in enumerate(words):
        for j, word2 in enumerate(words):
            if word1[0] == word2[0]:
                adj_matrix[i, j] = 1.0  # set connection
    return adj_matrix
def adjacency_to_edge_index(adjacency_matrix):
    """
    Converts an adjacency matrix to edge indices.

    Args:
        adjacency_matrix (torch.Tensor): A 2D tensor representing the adjacency matrix.

    Returns:
        torch.Tensor: A 2xN tensor representing the edge indices, where N is the number of edges.
    """
    row, col = adjacency_matrix.nonzero(as_tuple=False).t()
    edge_index = torch.stack([row, col], dim=0)
    return edge_index
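To sanity-check these two helpers, here is a tiny made-up example and the output I expect:

toy_tokens = ['the', 'cat', 'sat', 'on', 'the']
adj = create_word_cooccurrence_adjacency(toy_tokens)
print(adj.shape)  # torch.Size([5, 5])
# Sorted tokens are ['cat', 'on', 'sat', 'the', 'the']; every token matches its
# own first character (5 diagonal ones) and the two 'the' entries also match
# each other (2 off-diagonal ones), so there are 7 edges in total.
edge_index = adjacency_to_edge_index(adj)
print(edge_index.shape)  # torch.Size([2, 7])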
class GCN(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        print(f'{x.shape}')  # debug: output shape
        return x
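As I understand it, GCNConv expects x of shape [num_nodes, in_channels] and edge_index of shape [2, num_edges] whose entries are all smaller than num_nodes. A toy forward pass through this model (the numbers are made up):

# Toy forward pass (made-up numbers): 5 nodes, 128 features per node.
toy_model = GCN(in_channels=128, hidden_channels=64, out_channels=2)
toy_x = torch.randn(5, 128)                       # [num_nodes, in_channels]
toy_edge_index = torch.tensor([[0, 1, 2, 3, 4],
                               [1, 2, 3, 4, 0]])  # all indices < 5
out = toy_model(toy_x, toy_edge_index)            # output shape [5, 2]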
def train_gcn_epoch(model, optimizer, criterion, train_loader):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.train()
    train_loss = 0
    train_acc = 0
    # Build the tokenizer once per epoch rather than once per batch
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
    for text, input_ids, attention_mask, labels in tqdm(train_loader, desc='Training'):
        input_ids2 = input_ids.to(dtype=torch.float32).to(device).clone()
        labels2 = labels.to(dtype=torch.float32).to(device).clone()
        edge_index_batch = []
        adj_matrix_batch = []
        for j in range(len(text)):  # for a batch of 32 texts, take one text at a time
            inputs = tokenizer.encode(
                text[j],
                add_special_tokens=True,
                max_length=130,
                padding='max_length',
                truncation=True)
            tokens = tokenizer.convert_ids_to_tokens(inputs)
            tokens = tokens[1:129]  # drop [CLS], keep 128 tokens
            print(f'tokens {len(tokens)}')
            adj_matrix = create_word_cooccurrence_adjacency(tokens)
            print(f'adj {adj_matrix}')
            # print(f'input_ids {input_ids2.shape}')
            adj_matrix_batch.append(adj_matrix)
            edge_index = adjacency_to_edge_index(adj_matrix)
            edge_index_batch.append(edge_index)
        data_all_list = []
        for i in range(len(text)):
            data = Data(x=torch.unsqueeze(input_ids2[i], 0), edge_index=edge_index_batch[i], y=labels2[i])
            data_all_list.append(data)
        batch = Batch.from_data_list(data_all_list)
        # print(f'x {batch.x.shape}')
        # print(f'edge {batch.edge_index.shape}')
        # print(f'label {labels.shape}')
        # print(f'num_nodes {batch.num_nodes}')
        # print(f'edge index max {batch.edge_index.max()}')
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = model(batch.x.to(device), batch.edge_index.to(torch.int64).to(device))
        loss = criterion(outputs, labels2.to(torch.int64))
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
        train_acc += (outputs.argmax(1) == labels.to(device)).sum().item()
    train_loss /= len(train_loader)
    train_acc /= len(train_loader.dataset)
    return train_loss, train_acc
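For what it is worth, I suspect the mismatch comes from x=torch.unsqueeze(input_ids2[i], 0) above: it gives each graph a single node with 128 features, while the edge indices refer to up to 128 token nodes. A sketch of the alternative I have been considering for that Data line (untested; each of the 128 tokens becomes a node whose id is a 1-dimensional feature, so the model would need in_channels=1):

# Untested idea: one node per token, so the node count (128) matches the
# indices produced by adjacency_to_edge_index.
# Caveats: create_word_cooccurrence_adjacency sorts the tokens, so this node
# order would not match the input_ids order, and the GCN output would then be
# per-node, so a readout such as torch_geometric.nn.global_mean_pool would be
# needed to get one logit vector per graph.
data = Data(x=input_ids2[i].unsqueeze(1),   # [128, 1]: 128 nodes, 1 feature each
            edge_index=edge_index_batch[i],
            y=labels2[i])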
# %% set up and train the GCN
model = GCN(in_channels=128, hidden_channels=64, out_channels=2).to(device)
optimizer = optim.Adam(model.parameters(), lr=2e-5)
criterion = nn.CrossEntropyLoss()
best_val_acc = 0
# Training loop
num_epochs = 25
for epoch in range(num_epochs):
    train_loss, train_acc = train_gcn_epoch(model, optimizer, criterion, train_loader)