LSTM Project: Sentiment Analysis
In this project, you will build an LSTM-based sentiment classifier for IMDB movie reviews. You will learn to load text data, create word embeddings, and train a recurrent model.
Project: Sentiment analysis (positive/negative) on IMDB reviews using LSTM.
Step 1: Load and Preprocess Data
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
# Step 1: materialize the torchtext IMDB train/test splits as iterators
# of (label, review_text) pairs.
# NOTE(review): in modern torchtext these are one-shot iterators — building
# the vocabulary below consumes train_iter, so it must be re-created before
# training. TODO confirm against the torchtext version in use.
train_iter, test_iter = IMDB(split=('train', 'test'))
# 'basic_english' is torchtext's simple built-in tokenizer (lowercases and
# splits on whitespace/punctuation).
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data_iter):
    """Yield the token list for each review in `data_iter`.

    `data_iter` yields (label, review_text) pairs; labels are ignored
    because only the text contributes to the vocabulary.
    """
    yield from (tokenizer(text) for _, text in data_iter)
# Build the vocabulary from the training split.
# Specials: index 0 = "<unk>" (out-of-vocabulary token), index 1 = "<pad>"
# (padding token). The original text lost these tokens to HTML stripping,
# leaving two duplicate empty strings, which is invalid.
vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>", "<pad>"])
# Map any word not seen during vocab construction to "<unk>" instead of
# raising a runtime error on lookup.
vocab.set_default_index(vocab["<unk>"])

# Step 2: Define LSTM Model
class LSTMClassifier(nn.Module):
    """LSTM-based binary sentiment classifier.

    Embeds token ids, runs them through a stacked LSTM, and maps the final
    hidden state of the last layer to a single logit per example.

    Args:
        vocab_size: number of tokens in the vocabulary.
        embedding_dim: size of each word-embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output logits (1 for binary classification).
        n_layers: number of stacked LSTM layers.
        dropout: dropout probability (applied between LSTM layers and
            before the final linear layer).
        pad_idx: embedding index of the padding token; its embedding row is
            kept at zero and receives no gradient. Defaults to 1, which is
            the index of "<pad>" when the vocab is built with
            specials=["<unk>", "<pad>"].
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim,
                 n_layers, dropout, pad_idx=1):
        super().__init__()
        # pad_idx is a parameter rather than a read of the module-level
        # `vocab` (the original's padding token was also lost to HTML
        # stripping), so the class has no hidden global dependency.
        self.embedding = nn.Embedding(vocab_size, embedding_dim,
                                      padding_idx=pad_idx)
        # batch_first=True: inputs are (batch, seq_len).
        # NOTE: nn.LSTM applies `dropout` only between layers, so it has an
        # effect only when n_layers > 1 (it is 2 here).
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers,
                            dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Classify a batch of token-id sequences.

        Args:
            text: LongTensor of shape (batch, seq_len).

        Returns:
            Logit tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))   # (batch, seq, emb)
        output, (hidden, cell) = self.lstm(embedded)
        # hidden[-1] is the final hidden state of the last layer:
        # shape (batch, hidden_dim).
        hidden = self.dropout(hidden[-1])
        return self.fc(hidden)

# Step 3: Training Loop
# Instantiate the classifier:
# vocab_size=len(vocab), embedding_dim=100, hidden_dim=256,
# output_dim=1 (single logit), n_layers=2, dropout=0.5.
model = LSTMClassifier(len(vocab), 100, 256, 1, 2, 0.5)
# Combines sigmoid + binary cross-entropy in one numerically stable op.
# NOTE(review): raw torchtext IMDB labels are 1/2 (neg/pos) — confirm the
# batching code maps them to 0/1 before they reach this loss.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
def train_epoch(model, iterator, optimizer, criterion):
    """Train `model` for one full pass over `iterator`.

    Each batch must expose `.text` (model input) and `.label` attributes.
    Returns the mean per-batch loss for the epoch.
    """
    model.train()  # enable dropout / training-mode behavior
    running_loss = 0.0
    for batch in iterator:
        optimizer.zero_grad()
        # squeeze(1): (batch, 1) logits -> (batch,) to match the labels
        logits = model(batch.text).squeeze(1)
        batch_loss = criterion(logits, batch.label.float())
        batch_loss.backward()
        optimizer.step()
        running_loss += batch_loss.item()
    # Average over the number of batches the iterator reports.
    return running_loss / len(iterator)

# Step 4: Evaluate
def evaluate(model, iterator, criterion):
    """Compute the mean per-batch loss over `iterator` without training.

    Puts the model in eval mode (disables dropout) and runs under
    `torch.no_grad()` so no gradients are tracked.
    """
    model.eval()
    running_loss = 0.0
    with torch.no_grad():
        for batch in iterator:
            logits = model(batch.text).squeeze(1)
            running_loss += criterion(logits, batch.label.float()).item()
    return running_loss / len(iterator)

# Two Minute Drill
- Use an LSTM with word embeddings for text classification.
- Build the vocabulary from the IMDB training split.
- Train with binary cross-entropy loss (BCEWithLogitsLoss on raw logits).
- Expected accuracy: roughly 85-88%.
Need more clarification?
Drop us an email at career@quipoinfotech.com
