diff --git a/internal/tokenizer/tokenize.go b/internal/tokenizer/tokenize.go
index dadbccda..6a721c46 100644
--- a/internal/tokenizer/tokenize.go
+++ b/internal/tokenizer/tokenize.go
@@ -1,3 +1,6 @@
+// Package tokenizer implements file tokenization used by the enry content
+// classifier. This package is an implementation detail of enry and should not
+// be imported by other packages.
 package tokenizer
 
 import (
@@ -8,6 +11,9 @@ import (
 
 const byteLimit = 100000
 
+// Tokenize returns language-agnostic lexical tokens from content. The tokens
+// returned should match what the Linguist library returns. At most the first
+// 100KB of content are tokenized.
 func Tokenize(content []byte) []string {
 	if len(content) > byteLimit {
 		content = content[:byteLimit]
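
For illustration, a minimal sketch of how the 100KB cap documented on Tokenize could be exercised from inside the package. This is a hypothetical in-package test, not part of this diff: the file name and test name are invented, and it only assumes what the diff itself shows, namely that content beyond byteLimit is truncated before tokenization.

// tokenize_sketch_test.go — hypothetical, for illustration only.
package tokenizer

import (
	"bytes"
	"reflect"
	"testing"
)

// Checks the documented 100KB cap: tokenizing oversized content should be
// equivalent to tokenizing only its first byteLimit bytes.
func TestTokenizeByteLimit(t *testing.T) {
	// ~150KB of input, well past byteLimit (100000 bytes).
	big := bytes.Repeat([]byte("func main() {}\n"), 10000)

	got := Tokenize(big)
	want := Tokenize(big[:byteLimit])
	if !reflect.DeepEqual(got, want) {
		t.Errorf("got %d tokens, want %d: input should be capped at byteLimit", len(got), len(want))
	}
}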