Skip to content

Tokenizer

kenon.tokenizer.Tokenizer

Sentence and word tokeniser backed by spaCy.

Supports optional lemmatisation for any language that has a spaCy model. Long texts are automatically split into chunks before processing so that book-length inputs work without running out of memory.

Parameters:

Name Type Description Default
lang str

spaCy model name, e.g. "en_core_web_sm" or "de_core_news_sm".

'en_core_web_sm'
lemmatize bool

If True, return lemmas instead of surface forms.

False
lower bool

If True, lowercase all tokens.

True
Contract
  • lang must be a valid spaCy model name installed on the system.
  • Raises RuntimeError if the model is not installed.
  • All methods accept str inputs only, never file paths.
  • Pure whitespace and punctuation tokens are excluded by default.
Example

>>> t = Tokenizer("en_core_web_sm")
>>> sents = t.sentencize("The cat sat. The dog ran.")
>>> len(sents)
2

Source code in kenon/tokenizer.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
class Tokenizer:
    """Sentence and word tokeniser backed by spaCy.

    Supports optional lemmatisation for any language that has a spaCy model.
    Long texts are split into chunks and parsed one chunk at a time, so that
    book-length inputs work without keeping a parse of the whole text in
    memory.

    Args:
        lang: spaCy model name, e.g. ``"en_core_web_sm"`` or ``"de_core_news_sm"``.
        lemmatize: If True, return lemmas instead of surface forms.
        lower: If True, lowercase all tokens.

    Contract:
        - ``lang`` must be a valid spaCy model name installed on the system.
        - Raises ``RuntimeError`` if the model is not installed. The model is
          loaded lazily, so the error surfaces on first use, not on construction.
        - All methods accept ``str`` inputs only, never file paths.
        - Pure whitespace and punctuation tokens are excluded by default.

    Example:
        >>> t = Tokenizer("en_core_web_sm")
        >>> sents = t.sentencize("The cat sat. The dog ran.")
        >>> len(sents)
        2
    """

    def __init__(
        self,
        lang: str = "en_core_web_sm",
        lemmatize: bool = False,
        lower: bool = True,
    ) -> None:
        self._lang = lang
        self._lemmatize = lemmatize
        self._lower = lower
        # Populated by _load() on first use; construction never touches spaCy.
        self._nlp: Language | None = None

    def _load(self) -> Language:
        """Lazily load the spaCy model on first use.

        The parser and NER components are disabled and a rule-based
        sentencizer is added in their place so that sentence boundaries are
        still available.

        Returns:
            The cached spaCy pipeline.

        Raises:
            RuntimeError: If spaCy cannot find the model named by ``lang``.
        """
        if self._nlp is not None:
            return self._nlp

        try:
            nlp = spacy.load(
                self._lang,
                disable=["parser", "ner"],
            )
        except OSError as exc:
            msg = (
                f"spaCy model '{self._lang}' is not installed.\n"
                f"Run: python -m spacy download {self._lang}\n"
                f"Available models: https://spacy.io/models"
            )
            raise RuntimeError(msg) from exc

        # Add rule-based sentencizer to replace the disabled parser
        if "sentencizer" not in nlp.pipe_names:
            nlp.add_pipe("sentencizer")

        # Cache only once the pipeline is fully configured.
        self._nlp = nlp
        return nlp

    def _chunk_text(self, text: str) -> list[str]:
        """Split text into chunks of at most ``_CHUNK_TARGET`` characters.

        For each chunk the preferred split point is the last paragraph break
        (double newline) inside the window, then the last single newline,
        then the last space. A separator is only accepted when it lies in the
        second half of the window, so chunks do not become needlessly small.
        If no separator qualifies, the text is cut at exactly
        ``_CHUNK_TARGET`` characters.

        Args:
            text: Input text of any length.

        Returns:
            Non-empty list of chunks whose concatenation equals ``text``.
        """
        total = len(text)
        if total <= _CHUNK_TARGET:
            return [text]

        chunks: list[str] = []
        start = 0
        # Walk the text by offset instead of re-slicing the remaining tail,
        # which would copy the unprocessed text once per chunk.
        while total - start > _CHUNK_TARGET:
            limit = start + _CHUNK_TARGET
            split_at = limit  # hard split when no separator qualifies
            for sep in ("\n\n", "\n", " "):
                pos = text.rfind(sep, start, limit)
                if pos - start > _CHUNK_TARGET // 2:  # don't split too early
                    split_at = pos + len(sep)
                    break
            chunks.append(text[start:split_at])
            start = split_at

        # The loop exits with between 1 and _CHUNK_TARGET characters left.
        chunks.append(text[start:])
        return chunks

    def _prepare(self, text: str) -> tuple[Language, list[str]]:
        """Load the pipeline and chunk ``text`` ready for parsing.

        Also raises ``nlp.max_length`` when needed so that the longest chunk
        is accepted by spaCy (with a small safety margin).

        Returns:
            The loaded pipeline and the list of chunks to feed to it.
        """
        nlp = self._load()
        chunks = self._chunk_text(text)
        # _chunk_text always returns at least one chunk, so max() is safe.
        nlp.max_length = max(nlp.max_length, max(len(c) for c in chunks) + 100)
        return nlp, chunks

    def _process(self, text: str) -> list[spacy.tokens.Doc]:
        """Process text through spaCy, chunking if necessary.

        Returns one ``Doc`` per chunk. All Docs are held in the returned list
        at the same time, so peak memory grows with the number of chunks; the
        public methods of this class therefore parse chunk by chunk instead
        of calling this helper. It is kept for existing callers.
        """
        nlp, chunks = self._prepare(text)
        return [nlp(chunk) for chunk in chunks]

    def _token_text(self, token: spacy.tokens.Token) -> str:  # type: ignore[name-defined]
        """Extract text from a spaCy token, applying lemmatisation and lowering."""
        text = token.lemma_ if self._lemmatize else token.text
        if self._lower:
            text = text.lower()
        return text

    def _chunk_sentences(self, nlp: Language, chunk: str) -> list[str]:
        """Parse one chunk and return its non-empty, stripped sentences.

        Only plain strings are returned, so no reference to the ``Doc``
        outlives this call and it can be reclaimed before the next chunk is
        parsed.
        """
        doc = nlp(chunk)
        stripped = (sent.text.strip() for sent in doc.sents)
        return [sentence for sentence in stripped if sentence]

    def _chunk_tokens(self, nlp: Language, chunk: str, keep_punct: bool) -> Document:
        """Parse one chunk and return its sentences as lists of token strings.

        Whitespace tokens are always dropped, punctuation tokens are dropped
        unless ``keep_punct`` is true, and sentences left without tokens are
        omitted. Only plain strings are returned, so the ``Doc`` can be
        reclaimed once this call returns.
        """
        doc = nlp(chunk)
        sentences: Document = []
        for sent in doc.sents:
            tokens: list[Token] = []
            for token in sent:
                if token.is_space:
                    continue
                if not keep_punct and token.is_punct:
                    continue
                tokens.append(self._token_text(token))
            if tokens:
                sentences.append(tokens)
        return sentences

    def sentencize(self, text: str) -> list[str]:
        """Split text into sentence strings.

        Args:
            text: Input text to split into sentences.

        Returns:
            List of sentence strings.

        Raises:
            RuntimeError: If the spaCy model is not installed.

        Contract:
            - Never returns empty strings in the output list.
            - Sentence boundaries are determined by spaCy's sentence segmenter.
            - Long inputs are parsed one chunk at a time.

        Example:
            >>> t = Tokenizer("en_core_web_sm")
            >>> sents = t.sentencize("Hello world. Goodbye world.")
            >>> len(sents) == 2
            True
        """
        nlp, chunks = self._prepare(text)
        result: list[str] = []
        for chunk in chunks:
            result.extend(self._chunk_sentences(nlp, chunk))
        return result

    def tokenize(self, text: str, *, keep_punct: bool = False) -> Document:
        """Split text into a nested list: sentences -> tokens.

        Args:
            text: Input text to tokenize.
            keep_punct: If True, keep punctuation tokens.

        Returns:
            A list of sentences, each a list of token strings.

        Raises:
            RuntimeError: If the spaCy model is not installed.

        Contract:
            - Whitespace-only tokens are always excluded.
            - Punctuation tokens excluded unless ``keep_punct=True``.
            - Sentences with no remaining tokens are omitted.
            - Long inputs are parsed one chunk at a time.

        Example:
            >>> t = Tokenizer("en_core_web_sm", lemmatize=True)
            >>> doc = t.tokenize("The cats were running.")
            >>> "cat" in doc[0]
            True
            >>> "run" in doc[0]
            True
        """
        nlp, chunks = self._prepare(text)
        result: Document = []
        for chunk in chunks:
            result.extend(self._chunk_tokens(nlp, chunk, keep_punct))
        return result

    def flat_tokens(self, text: str, *, keep_punct: bool = False) -> list[Token]:
        """Return all tokens in a single flat list (no sentence structure).

        Args:
            text: Input text to tokenize.
            keep_punct: If True, keep punctuation tokens.

        Returns:
            Flat list of token strings.

        Contract:
            - Equivalent to flattening the result of ``tokenize()``.
            - Each returned token is the surface form of a token in the
              original text, lowercased when ``lower`` is set, or its lemma
              when ``lemmatize`` is set.

        Example:
            >>> t = Tokenizer("en_core_web_sm")
            >>> tokens = t.flat_tokens("The cat sat on the mat.")
            >>> "cat" in tokens
            True
            >>> "." not in tokens
            True
        """
        doc = self.tokenize(text, keep_punct=keep_punct)
        return [token for sent in doc for token in sent]

flat_tokens(text, *, keep_punct=False)

Return all tokens in a single flat list (no sentence structure).

Parameters:

Name Type Description Default
text str

Input text to tokenize.

required
keep_punct bool

If True, keep punctuation tokens.

False

Returns:

Type Description
list[Token]

Flat list of token strings.

Contract
  • Equivalent to flattening the result of tokenize().
  • Each returned token is the surface form of a token in the original text, lowercased when lower is set, or its lemma when lemmatize is set.
Example

>>> t = Tokenizer("en_core_web_sm")
>>> tokens = t.flat_tokens("The cat sat on the mat.")
>>> "cat" in tokens
True
>>> "." not in tokens
True

Source code in kenon/tokenizer.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def flat_tokens(self, text: str, *, keep_punct: bool = False) -> list[Token]:
    """Tokenize ``text`` and return every token in one flat list.

    Sentence structure is discarded: the nested result of ``tokenize()`` is
    concatenated sentence by sentence, preserving token order.

    Args:
        text: Input text to tokenize.
        keep_punct: If True, punctuation tokens are kept.

    Returns:
        Flat list of token strings.

    Contract:
        - Equivalent to flattening the result of ``tokenize()``.
        - Tokens appear in the same order as in ``tokenize()``.

    Example:
        >>> t = Tokenizer("en_core_web_sm")
        >>> tokens = t.flat_tokens("The cat sat on the mat.")
        >>> "cat" in tokens
        True
        >>> "." not in tokens
        True
    """
    flattened: list[Token] = []
    for sentence in self.tokenize(text, keep_punct=keep_punct):
        flattened.extend(sentence)
    return flattened

sentencize(text)

Split text into sentence strings.

Parameters:

Name Type Description Default
text str

Input text to split into sentences.

required

Returns:

Type Description
list[str]

List of sentence strings.

Contract
  • Never returns empty strings in the output list.
  • Sentence boundaries are determined by spaCy's sentence segmenter.
Example

>>> t = Tokenizer("en_core_web_sm")
>>> sents = t.sentencize("Hello world. Goodbye world.")
>>> len(sents) == 2
True

Source code in kenon/tokenizer.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
def sentencize(self, text: str) -> list[str]:
    """Return the sentences of ``text`` as stripped strings.

    The text is run through ``self._process`` and the sentence spans of each
    resulting spaCy Doc are collected in order.

    Args:
        text: Input text to split into sentences.

    Returns:
        List of sentence strings with surrounding whitespace removed.

    Contract:
        - Never returns empty strings in the output list.
        - Sentence boundaries are determined by spaCy's sentence segmenter.

    Example:
        >>> t = Tokenizer("en_core_web_sm")
        >>> sents = t.sentencize("Hello world. Goodbye world.")
        >>> len(sents) == 2
        True
    """
    sentences: list[str] = []
    for parsed in self._process(text):
        candidates = (span.text.strip() for span in parsed.sents)
        # Spans that are whitespace-only strip down to "" and are dropped.
        sentences.extend(candidate for candidate in candidates if candidate)
    return sentences

tokenize(text, *, keep_punct=False)

Split text into a nested list: sentences -> tokens.

Parameters:

Name Type Description Default
text str

Input text to tokenize.

required
keep_punct bool

If True, keep punctuation tokens.

False

Returns:

Type Description
Document

A list of sentences, each a list of token strings.

Contract
  • Whitespace-only tokens are always excluded.
  • Punctuation tokens excluded unless keep_punct=True.
Example

>>> t = Tokenizer("en_core_web_sm", lemmatize=True)
>>> doc = t.tokenize("The cats were running.")
>>> "cat" in doc[0]
True
>>> "run" in doc[0]
True

Source code in kenon/tokenizer.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def tokenize(self, text: str, *, keep_punct: bool = False) -> Document:
    """Tokenize ``text`` into sentences, each a list of token strings.

    Every sentence span produced by ``self._process`` is filtered and its
    tokens are converted with ``self._token_text``.

    Args:
        text: Input text to tokenize.
        keep_punct: If True, punctuation tokens are kept.

    Returns:
        A list of sentences, each a list of token strings.

    Contract:
        - Whitespace-only tokens are always excluded.
        - Punctuation tokens excluded unless ``keep_punct=True``.
        - Sentences left with no tokens after filtering are omitted.

    Example:
        >>> t = Tokenizer("en_core_web_sm", lemmatize=True)
        >>> doc = t.tokenize("The cats were running.")
        >>> "cat" in doc[0]
        True
        >>> "run" in doc[0]
        True
    """
    sentences: Document = []
    for parsed in self._process(text):
        for span in parsed.sents:
            words: list[Token] = [
                self._token_text(tok)
                for tok in span
                if not tok.is_space and (keep_punct or not tok.is_punct)
            ]
            if not words:
                # Nothing survived filtering (e.g. a punctuation-only span).
                continue
            sentences.append(words)
    return sentences