"""Compute the Pointwise Mutual Information (PMI) between pairs of words in a corpus.

PMI measures how much more (or less) likely two words are to co-occur compared
to what we would expect if they were independent.
"""
import math
from collections import Counter, defaultdict
def pointwise_mutual_information(corpus: list[list[str]], word1: str, word2: str, window_size: int = None) -> float:
    """Return the pointwise mutual information (PMI) of two words in a corpus.

    PMI(x, y) = log2(P(x, y) / (P(x) * P(y))), where P(word) is the word's
    token count divided by the total number of tokens in the corpus.

    How the joint probability P(x, y) is estimated depends on ``window_size``:

    * ``None`` (default): the fraction of documents that contain both words.
    * an integer ``k``: the number of occurrences of ``word1`` that have
      ``word2`` within ``k`` tokens on either side, divided by the total
      number of tokens. The window includes the occurrence itself, so when
      ``word1 == word2`` every occurrence counts as a co-occurrence.

    Args:
        corpus: Tokenized documents, one list of tokens per document.
        word1: First word of the pair.
        word2: Second word of the pair.
        window_size: Half-width of the co-occurrence window in tokens, or
            ``None`` for document-level co-occurrence.

    Returns:
        The PMI rounded to 4 decimal places, or ``0.0`` when it is undefined:
        the corpus contains no tokens, either word never occurs, or the two
        words never co-occur.
    """
    total_words = 0
    n_contexts = 0
    count1 = 0
    count2 = 0
    co_occur_count = 0

    for doc in corpus:
        n_contexts += 1
        total_words += len(doc)

        # Only the two target words matter, so count them directly instead of
        # building a frequency table for the whole vocabulary.
        in_doc1 = doc.count(word1)
        in_doc2 = doc.count(word2)
        count1 += in_doc1
        count2 += in_doc2

        if not (in_doc1 and in_doc2):
            # A document missing either word cannot contribute a co-occurrence
            # in document mode or in window mode.
            continue

        if window_size is None:
            # Document-level co-occurrence: one hit per document.
            co_occur_count += 1
            continue

        # Window-based co-occurrence: one hit per occurrence of word1 that has
        # word2 somewhere in [i - window_size, i + window_size].
        for i, token in enumerate(doc):
            if token == word1:
                start = max(0, i - window_size)
                end = min(len(doc), i + window_size + 1)
                if word2 in doc[start:end]:
                    co_occur_count += 1

    if total_words == 0:
        # Empty corpus (or only empty documents): every probability is
        # undefined, so report 0.0 rather than dividing by zero.
        return 0.0

    p_word1 = count1 / total_words
    p_word2 = count2 / total_words
    if window_size is None:
        # total_words > 0 guarantees at least one document here.
        p_co_occur = co_occur_count / n_contexts
    else:
        p_co_occur = co_occur_count / total_words

    if p_co_occur == 0 or p_word1 == 0 or p_word2 == 0:
        # log2 of zero (or division by zero) is undefined; use 0.0 by convention.
        return 0.0

    pmi = math.log2(p_co_occur / (p_word1 * p_word2))
    return round(pmi, 4)