Skip to content

Co-occurrence

kenon.cooccurrence.build_cooccurrence_graph(tokens, window=2, stopwords=None, min_weight=0.0)

Build a weighted co-occurrence graph using skip-gram windows.

Each node is a token. Each edge weight is the relative co-occurrence frequency of the two tokens within the specified window.

Parameters:

Name Type Description Default
tokens list[Token]

Flat list of tokens (already lowercased / lemmatised as desired).

required
window int

Half-width of the skip-gram context window. A window of 2 means each token is paired with the 2 tokens before and 2 after.

2
stopwords frozenset[str] | None

Tokens to exclude from nodes and edges.

None
min_weight float

Drop edges with weight below this threshold.

0.0

Returns:

Type Description
SemanticGraph

A networkx.Graph with weight edge attributes.

Raises:

Type Description
ValueError

If window is less than 1.

Contract
  • No self-loops in the returned graph.
  • All edge weights are positive.
  • Stopword filtering happens before counting.
Example

>>> tokens = ["cat", "sat", "mat", "cat", "mat"]
>>> g = build_cooccurrence_graph(tokens, window=1)
>>> g.has_node("cat")
True
>>> g["cat"]["sat"]["weight"] > 0
True

Source code in kenon/cooccurrence.py
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
def build_cooccurrence_graph(
    tokens: list[Token],
    window: int = 2,
    stopwords: frozenset[str] | None = None,
    min_weight: float = 0.0,
) -> SemanticGraph:
    """Build a weighted co-occurrence graph using skip-gram windows.

    Each node is a token. Each edge weight is the relative co-occurrence
    frequency of the two tokens within the specified window, i.e. the number
    of in-window position pairs holding those two tokens divided by the
    number of in-window position pairs holding any two *different* tokens.

    Args:
        tokens: Flat list of tokens (already lowercased / lemmatised as desired).
        window: Half-width of the skip-gram context window. A window of 2 means
            each token is paired with the 2 tokens before and 2 after.
        stopwords: Tokens to exclude from nodes and edges.
        min_weight: Drop edges with weight below this threshold.

    Returns:
        A ``networkx.Graph`` with ``weight`` edge attributes. Nodes are only
        created as edge endpoints, so a token that never co-occurs with a
        different token (or whose edges all fall below ``min_weight``) does
        not appear in the graph. The graph is empty when no pair was counted.

    Raises:
        ValueError: If ``window`` is less than 1.

    Contract:
        - No self-loops in the returned graph.
        - All edge weights are positive.
        - Stopword filtering happens before counting.

    Example:
        >>> tokens = ["cat", "sat", "mat", "cat", "mat"]
        >>> g = build_cooccurrence_graph(tokens, window=1)
        >>> g.has_node("cat")
        True
        >>> g["cat"]["sat"]["weight"] > 0
        True
    """
    if window < 1:
        msg = f"window must be >= 1, got {window}"
        raise ValueError(msg)

    # Filter first so that the window spans the remaining tokens only.
    if stopwords:
        tokens = [t for t in tokens if t not in stopwords]

    pair_counts: Counter[tuple[str, str]] = Counter()
    n_tokens = len(tokens)

    # The window is symmetric, so scanning both directions would visit every
    # position pair twice and double both the counts and the normaliser.
    # Looking only forward visits each pair once and yields the same ratios.
    for i, token_a in enumerate(tokens):
        for j in range(i + 1, min(n_tokens, i + window + 1)):
            token_b = tokens[j]
            if token_a == token_b:
                # Identical tokens would form a self-loop; skip them.
                continue
            # Canonical ordering makes (a, b) and (b, a) share one counter key.
            pair = (min(token_a, token_b), max(token_a, token_b))
            pair_counts[pair] += 1

    graph: SemanticGraph = nx.Graph()

    total_pairs = sum(pair_counts.values())
    if total_pairs == 0:
        return graph

    for (a, b), count in pair_counts.items():
        weight = count / total_pairs
        if weight >= min_weight:
            graph.add_edge(a, b, weight=weight)

    return graph

kenon.cooccurrence.detect_collocations(tokens, n=2, metric='pmi', top_n=20, min_freq=2)

Detect statistically significant n-grams using NLTK collocation finders.

Parameters:

Name Type Description Default
tokens list[Token]

Flat token list.

required
n int

N-gram size. Supports 2 (bigrams) and 3 (trigrams).

2
metric str

Scoring metric. One of "pmi", "chi_sq", "likelihood".

'pmi'
top_n int

Number of top collocations to return.

20
min_freq int

Minimum frequency filter applied before scoring.

2

Returns:

Type Description
list[tuple[str, ...]]

List of token tuples sorted by score descending.

Raises:

Type Description
ValueError

If n is not 2 or 3.

ValueError

If metric is not one of the supported values.

Contract
  • Returns at most top_n tuples.
  • Each tuple has length n.
Example

>>> tokens = ["new", "york", "city", "new", "york", "times"] * 10
>>> colls = detect_collocations(tokens, n=2, top_n=5)
>>> ("new", "york") in colls
True

Source code in kenon/cooccurrence.py
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
def detect_collocations(
    tokens: list[Token],
    n: int = 2,
    metric: str = "pmi",
    top_n: int = 20,
    min_freq: int = 2,
) -> list[tuple[str, ...]]:
    """Find the highest-scoring n-grams with NLTK's collocation finders.

    Args:
        tokens: Flat token list.
        n: Size of the n-grams to look for: 2 (bigrams) or 3 (trigrams).
        metric: Association measure used for ranking. One of ``"pmi"``,
            ``"chi_sq"``, ``"likelihood"``.
        top_n: How many of the best-scoring collocations to return.
        min_freq: Frequency threshold passed to the finder's frequency filter
            before scoring.

    Returns:
        The finder's ``top_n`` best n-grams under the chosen measure, as
        token tuples.

    Raises:
        ValueError: If ``metric`` is not one of the supported values.
        ValueError: If ``n`` is not 2 or 3.

    Contract:
        - Returns at most ``top_n`` tuples.
        - Each tuple has length ``n``.

    Example:
        >>> tokens = ["new", "york", "city", "new", "york", "times"] * 10
        >>> colls = detect_collocations(tokens, n=2, top_n=5)
        >>> ("new", "york") in colls
        True
    """
    # Public metric name -> attribute name on the NLTK association-measure
    # classes (the bigram and trigram classes expose the same three names).
    measure_attr = {
        "pmi": "pmi",
        "chi_sq": "chi_sq",
        "likelihood": "likelihood_ratio",
    }

    # The metric is validated before ``n``, so a bad metric is reported first.
    if metric not in measure_attr:
        msg = f"Unsupported metric '{metric}'. Supported: 'pmi', 'chi_sq', 'likelihood'"
        raise ValueError(msg)

    if n == 2:
        finder_cls, measures = BigramCollocationFinder, BigramAssocMeasures
    elif n == 3:
        finder_cls, measures = TrigramCollocationFinder, TrigramAssocMeasures
    else:
        msg = f"n must be 2 or 3, got {n}"
        raise ValueError(msg)

    finder = finder_cls.from_words(tokens)
    finder.apply_freq_filter(min_freq)
    score_fn = getattr(measures, measure_attr[metric])
    return finder.nbest(score_fn, top_n)