LSTM Project: Sentiment Analysis
In this project, you will build an LSTM-based sentiment classifier for IMDB movie reviews. You will learn to load text data, create word embeddings, and train a recurrent model.
Project: Sentiment analysis (positive/negative) on IMDB reviews using LSTM.
Step 1: Load and Preprocess Data
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Step 1: materialize the torchtext IMDB train/test splits as iterators
# of (label, review_text) pairs.
# NOTE(review): in modern torchtext these are one-shot iterators — building
# the vocabulary below consumes train_iter, so it must be re-created before
# training. TODO confirm against the torchtext version in use.
train_iter, test_iter = IMDB(split=('train', 'test'))
# 'basic_english' is torchtext's simple built-in tokenizer (lowercases and
# splits on whitespace/punctuation).
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    """Yield the token list for each review in `data_iter`.

    `data_iter` yields (label, review_text) pairs; labels are ignored
    because only the text contributes to the vocabulary.
    """
    yield from (tokenizer(text) for _, text in data_iter)
# Build the vocabulary from the training split.
# Specials: index 0 = "<unk>" (out-of-vocabulary token), index 1 = "<pad>"
# (padding token). The original text lost these tokens to HTML stripping,
# leaving two duplicate empty strings, which is invalid.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Map any word not seen during vocab construction to "<unk>" instead of
# raising a runtime error on lookup.
vocab.set_default_index(vocab["<unk>"])

# Step 2: Define LSTM Model
class LSTMClassifier(nn.Module):
    """LSTM-based binary sentiment classifier.

    Embeds token ids, runs them through a stacked LSTM, and maps the final
    hidden state of the last layer to a single logit per example.

    Args:
        vocab_size: number of tokens in the vocabulary.
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output logits (1 for binary classification).
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability (applied between LSTM layers and
            before the final linear layer).
        pad_idx: embedding index of the padding token; its embedding row is
            kept at zero and receives no gradient. Defaults to 1, which is
            the index of "<pad>" when the vocab is built with
            specials=["<unk>", "<pad>"].
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout, pad_idx=1):
        super().__init__()
        # pad_idx is a parameter rather than a read of the module-level
        # `vocab` (the original's padding token was also lost to HTML
        # stripping), so the class has no hidden global dependency.
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=pad_idx)
        # batch_first=True: inputs are (batch, seq_len).
        # NOTE: nn.LSTM applies `dropout` only between layers, so it has an
        # effect only when n_layers > 1 (it is 2 here).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Classify a batch of token-id sequences.

        Args:
            text: LongTensor of shape (batch, seq_len).

        Returns:
            Logit tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))   # (batch, seq, emb)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden[-1] is the final hidden state of the last layer:
        # shape (batch, hidden_dim).
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Step 3: Training Loop
# Instantiate the classifier:
# vocab_size=len(vocab), embedding_dim=100, hidden_dim=256,
# output_dim=1 (single logit), n_layers=2, dropout=0.5.
model = LSTMClassifier(len(vocab), 100, 256, 1, 2, 0.5)
# Combines sigmoid + binary cross-entropy in one numerically stable op.
# NOTE(review): raw torchtext IMDB labels are 1/2 (neg/pos) — confirm the
# batching code maps them to 0/1 before they reach this loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
def train_epoch(model, iterator, optimizer, criterion):
    """Train `model` for one full pass over `iterator`.

    Each batch must expose `.text` (model input) and `.label` attributes.
    Returns the mean per-batch loss for the epoch.
    """
    model.train()  # enable dropout / training-mode behavior
    running_loss = 0.0
    for batch in iterator:
        optimizer.zero_grad()
        # squeeze(1): (batch, 1) logits -> (batch,) to match the labels
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label.float())
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Average over the number of batches the iterator reports.
    return running_loss / len(iterator)

# Step 4: Evaluate
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without training.

    Puts the model in eval mode (disables dropout) and runs under
    `torch.no_grad()` so no gradients are tracked.
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label.float()).item()
    return running_loss / len(iterator)

# Two Minute Drill
- Use an LSTM with word embeddings for text classification.
- Build the vocabulary from the IMDB training split.
- Train with binary cross-entropy loss (BCEWithLogitsLoss on raw logits).
- Expected accuracy: roughly 85-88%.
Need more clarification?
Drop us an email at career@quipoinfotech.com
