#60 · NLP · Medium
Solve on deep-ml.com

Implement TF-IDF (Term Frequency-Inverse Document Frequency) from scratch. Given a list of documents (each a string), compute the TF-IDF matrix, where each row corresponds to a document and each column corresponds to a unique term in the corpus.
import math
from collections import Counter
def compute_tfidf(corpus):
    """Return the TF-IDF matrix for a list of document strings.

    Each document is lower-cased and split on whitespace. The vocabulary is
    the sorted set of all tokens in the corpus; column ``j`` of the result
    corresponds to the ``j``-th term in that sorted order, and row ``i``
    corresponds to ``corpus[i]``.

    Weights are computed as::

        tf(t, d)  = count of t in d / number of tokens in d
        idf(t)    = ln(n_docs / number of documents containing t)
        tfidf     = round(tf * idf, 4)

    The IDF is unsmoothed, so a term that occurs in every document gets a
    weight of 0.0.

    Args:
        corpus: Iterable of document strings.

    Returns:
        A list of rows (one per document), each a list of floats with one
        entry per vocabulary term. An empty corpus yields ``[]``; a document
        with no tokens yields an all-zero row.
    """
    docs = [doc.lower().split() for doc in corpus]
    n_docs = len(docs)

    # Document frequency: count each term once per document. Deduplicating
    # with set() keeps this linear in corpus size instead of scanning every
    # token list for every vocabulary word.
    df = Counter(term for doc in docs for term in set(doc))

    vocab = sorted(df)
    word_to_idx = {term: idx for idx, term in enumerate(vocab)}

    # IDF depends only on the term, so compute it once up front.
    idf = {term: math.log(n_docs / doc_count) for term, doc_count in df.items()}

    tfidf_matrix = []
    for doc in docs:
        total_terms = len(doc)
        row = [0.0] * len(vocab)
        # An empty document has no counts, so the division below never runs
        # with total_terms == 0.
        for term, count in Counter(doc).items():
            tf = count / total_terms
            row[word_to_idx[term]] = round(tf * idf[term], 4)
        tfidf_matrix.append(row)
    return tfidf_matrix