From ec9c23e4118b2ec834c41592c6b98e2099c54426 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?M=C3=A1ximo=20Cuadros?= Date: Sat, 16 Jul 2016 22:41:51 +0200 Subject: [PATCH] content disambiguate logic --- common.go | 2 +- content.go | 709 +++++++++++++++++++++++++++++++---------------------- 2 files changed, 411 insertions(+), 300 deletions(-) diff --git a/common.go b/common.go index c171f9eb..89f0f5b0 100644 --- a/common.go +++ b/common.go @@ -938,7 +938,7 @@ func init() { var ExtensionsByLanguage map[string][]string -var ignoredExtensions = []string{".asc", ".cgi", ".fcgi", ".gml", ".vhost"} +var ignoredExtensions = []string{".asc", ".cgi", ".fcgi", ".gml", ".fx", ".vhost"} func GetLanguageExtensions(language string) []string { return ExtensionsByLanguage[language] diff --git a/content.go b/content.go index ac5af039..57b2f919 100644 --- a/content.go +++ b/content.go @@ -2,12 +2,14 @@ package slinguist import ( "path/filepath" + "strings" "gopkg.in/toqueteos/substring.v1" ) func GetLanguageByContent(filename string, content []byte) (lang string, safe bool) { - if fnMatcher, ok := matchers[filepath.Ext(filename)]; ok { + ext := strings.ToLower(filepath.Ext(filename)) + if fnMatcher, ok := matchers[ext]; ok { lang, safe = fnMatcher(content) return } @@ -18,146 +20,422 @@ func GetLanguageByContent(filename string, content []byte) (lang string, safe bo type languageMatcher func([]byte) (string, bool) var matchers = map[string]languageMatcher{ - ".bf": bfExtLanguage, - ".b": bExtLanguage, - ".cl": clExtLanguage, - ".inc": incExtLanguage, - ".cls": clsExtLanguage, - ".m": mExtLanguage, - ".ms": msExtLanguage, - ".md": mdExtLanguage, - ".fs": fsExtLanguage, - ".h": hExtLanguage, - ".hh": hhExtLanguage, - ".l": lExtLanguage, - ".n": nExtLanguage, - ".lisp": lispExtLanguage, - ".lsp": lispExtLanguage, - ".pm": pmExtLanguage, - ".t": tExtLanguage, - ".ts": tsExtLanguage, - ".tsx": tsxExtLanguage, - ".rs": rsExtLanguage, - ".pl": plExtLanguage, - ".pro": proExtLanguage, - ".toc": tocExtLanguage, - ".sls": slsExtLanguage, - ".sql": sqlExtLanguage, + ".bf": func(i []byte) (string, bool) { + if substring.BytesRegexp(`(fprintf|function|return)`).Match(i) { + return "HyPhy", true + } + + return "Brainfuck", false + }, + ".b": func(i []byte) (string, bool) { + if substring.BytesRegexp(`(include|modules)`).Match(i) { + return "Limbo", true + } + + return "Brainfuck", false + }, + ".bb": func(i []byte) (string, bool) { + if blitzBasicMatcher.Match(i) { + return "BlitzBasic", true + } else if substring.BytesRegexp(`^\s*(# |include|require)\b`).Match(i) { + return "BitBake", true + } + + return OtherLanguage, false + }, + ".cl": func(i []byte) (string, bool) { + if commonLispMatcher.Match(i) { + return "Common Lisp", true + } else if coolMatcher.Match(i) { + return "Cool", true + } else if openCLMatcher.Match(i) { + return "OpenCL", true + } + + return OtherLanguage, false + }, + ".cls": func(i []byte) (string, bool) { + if apexMatcher.Match(i) { + return "Apex", true + } else if openEdgeABLMatcher.Match(i) { + return "OpenEdge ABL", true + } else if texMatcher.Match(i) { + return "TeX", true + } else if visualBasicMatcher.Match(i) { + return "Visual Basic", true + + } + + return OtherLanguage, false + }, + ".cs": func(i []byte) (string, bool) { + if substring.BytesRegexp(`![\w\s]+methodsFor: `).Match(i) { + return "Smalltalk", true + } + + return "C#", true + }, + ".ch": func(i []byte) (string, bool) { + if substring.BytesRegexp(`(?i)^\s*#\s*(if|ifdef|ifndef|define|command|xcommand|translate|xtranslate|include|pragma|undef)\b`).Match(i) { + return "xBase", true + } + + return "Charity", true + }, + ".d": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^module `).Match(i) { + return "D", true + } else if substring.BytesRegexp(`((dtrace:::)?BEGIN|provider |#pragma (D (option|attributes)|ident)\s)`).Match(i) { + return "DTrace", true + } else if substring.BytesRegexp(`(\/.*:( .* \\)$| : \\$|^ : |: \\$)`).Match(i) { + return "Makefile", true + } + + return OtherLanguage, true + }, + ".ecl": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^[^#]+:-`).Match(i) { + return "ECLiPSe", true + } else if substring.BytesHas(`:=`).Match(i) { + return "ECL", true + } + + return OtherLanguage, true + }, + ".es": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\s*(?:%%|main\s*\(.*?\)\s*->)`).Match(i) { + return "Erlang", true + } else if substring.BytesRegexp(`(?:\/\/|("|')use strict\\1|export\s+default\s|\/\*.*?\*\/)`).Match(i) { + return "JavaScript", true + } + + return OtherLanguage, true + }, + ".f": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\n: `).Match(i) { + return "Forth", true + } else if substring.BytesRegexp(`(?i)^([c*][^abd-z]| (subroutine|program|end)\s|\s*!)`).Match(i) { + return "FORTRAN", true + } + + return OtherLanguage, false + }, + ".fr": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^(: |also |new-device|previous )`).Match(i) { + return "Forth", true + } else if substring.BytesRegexp(`\s*(import|module|package|data|type)`).Match(i) { + return "Frege", true + } + + return "Text", false + }, + ".j": func(i []byte) (string, bool) { + if objectiveCMatcher.Match(i) { + return "Objective-J", true + } + + return "Jasmin", false + }, + ".inc": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^<\?(?:php)?`).Match(i) { + return "PHP", true + } + + return OtherLanguage, true + }, + ".m": func(i []byte) (string, bool) { + if objectiveCMatcher.Match(i) { + return "Objective-C", true + } else if substring.BytesHas(`:- module`).Match(i) { + return "Mercury", true + } else if substring.BytesRegexp(`\n: `).Match(i) { + return "MUF", true + } else if substring.BytesRegexp(`\n\s*;`).Match(i) { + return "M", true + } else if substring.BytesRegexp(`\n\s*\(\*`).Match(i) { + return "Mathematica", true + } else if substring.BytesRegexp(`\n\s*%`).Match(i) { + return "Matlab", true + } else if substring.BytesRegexp(`\w+\s*:\s*module\s*{`).Match(i) { + return "Limbo", true + } + + return OtherLanguage, false + }, + ".ms": func(i []byte) (string, bool) { + if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(i) { + return "Groff", true + } + + return "MAXScript", true + }, + ".md": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\n[-a-z0-9=#!\*\[|]`).Match(i) { + return "Markdown", true + } else if substring.BytesRegexp(`\n(;;|\(define_)`).Match(i) { + return "GCC Machine Description", true + } + + return OtherLanguage, false + }, + ".moo": func(i []byte) (string, bool) { + if substring.BytesHas(`:- module`).Match(i) { + return "Mercury", true + } + + return "Moocode", false + }, + ".e": func(i []byte) (string, bool) { + if substring.BytesRegexp(`feature\s--`).Match(i) { + return "Eiffel", true + } + + return "E", false + }, + ".fs": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\n(: |new-device)`).Match(i) { + return "Forth", true + } else if substring.BytesRegexp(`\s*(#light|import|let|module|namespace|open|type)`).Match(i) { + return "F#", true + } else if substring.BytesRegexp(`(#version|precision|uniform|varying|vec[234])`).Match(i) { + return "GLSL", true + } else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(i) { + return "Filterscript", true + } + + return OtherLanguage, false + }, + ".gs": func(i []byte) (string, bool) { + if substring.BytesRegexp(`uses java\.`).Match(i) { + return "Gosu", true + } + + return "JavaScript", false + }, + ".h": func(i []byte) (string, bool) { + if objectiveCMatcher.Match(i) { + return "Objective-C", true + } else if cPlusPlusMatcher.Match(i) { + return "C++", true + } + + return "C", true + }, + ".hh": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^<\?(?:hh)?`).Match(i) { + return "Hack", true + } else if cPlusPlusMatcher.Match(i) { + return "C++", true + } + + return OtherLanguage, false + }, + ".l": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\(def(un|macro)\s`).Match(i) { + return "Common Lisp", true + } else if substring.BytesRegexp(`(%[%{}]xs|<.*>)`).Match(i) { + return "Lex", true + } else if substring.BytesRegexp(`\.[a-z][a-z](\s|$)`).Match(i) { + return "Groff", true + } else if substring.BytesRegexp(`(de|class|rel|code|data|must)`).Match(i) { + return "PicoLisp", true + } + + return OtherLanguage, false + }, + ".ls": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\s*package\s*[\w\.\/\*\s]*\s*{`).Match(i) { + return "LoomScript", true + } + + return "LiveScript", false + }, + ".n": func(i []byte) (string, bool) { + if substring.BytesRegexp(`^[.']`).Match(i) { + return "Groff", true + } else if substring.BytesRegexp(`(module|namespace|using)`).Match(i) { + return "Nemerle", true + } + + return OtherLanguage, false + }, + ".ncl": func(i []byte) (string, bool) { + if substring.BytesHas("THE_TITLE").Match(i) { + return "Text", true + } + + return "NCL", true + }, + ".mod": func(i []byte) (string, bool) { + if substring.BytesHas("").Match(i) { + return "XML", true + } + + return "TypeScript", true + }, + ".tsx": func(i []byte) (string, bool) { + if substring.BytesHas("").Match(i) { + return "XML", true + } + + return "TypeScript", true + }, + ".tst": func(i []byte) (string, bool) { + if substring.BytesHas("gap> ").Match(i) { + return "GAP", true + } + + return "Scilab", true + }, + ".r": func(i []byte) (string, bool) { + if substring.BytesRegexp(`(?i)\bRebol\b`).Match(i) { + return "Rebol", true + } else if substring.BytesHas("<-").Match(i) { + return "R", true + } + + return OtherLanguage, false + }, + ".rs": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\n(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(i) { + return "Rust", true + } else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(i) { + return "RenderScript", true + } + + return OtherLanguage, false + }, + ".rpy": func(i []byte) (string, bool) { + if substring.BytesRegexp(`(import|from|class|def)\s`).Match(i) { + return "Python", true + } + + return "Ren'Py", false + }, + ".v": func(i []byte) (string, bool) { + if substring.BytesRegexp(`\nendmodule`).Match(i) { + return "Verilog", true + } else if substring.BytesRegexp(`(Require|Import)`).Match(i) { + return "Coq", true + } + + return OtherLanguage, false + }, + ".pl": func(i []byte) (string, bool) { + if prologMatcher.Match(i) { + return "Prolog", true + } else if perl6Matcher.Match(i) { + return "Perl6", true + } + + return "Perl", false + }, + ".pro": func(i []byte) (string, bool) { + if prologMatcher.Match(i) { + return "Prolog", true + } + + return OtherLanguage, false + }, + ".pod": func(i []byte) (string, bool) { + if substring.BytesRegexp(`=\w+\n`).Match(i) { + return "Pod", true + } + + return "Perl", false + }, + ".toc": func(i []byte) (string, bool) { + if substring.BytesRegexp("## |@no-lib-strip@").Match(i) { + return "World of Warcraft Addon Data", true + } else if substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(i) { + return "TeX", true + } + + return OtherLanguage, false + }, + ".sls": func(i []byte) (string, bool) { + if substring.BytesRegexp("## |@no-lib-strip@").Match(i) { + return "World of Warcraft Addon Data", true + } else if substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(i) { + return "TeX", true + } + + return OtherLanguage, false + }, + ".sql": func(i []byte) (string, bool) { + if pgSQLMatcher.Match(i) { + return "PLpgSQL", true + } else if db2SQLMatcher.Match(i) { + return "SQLPL", true + } else if oracleSQLMatcher.Match(i) { + return "PLSQL", true + } + + return "SQL", false + }, +} + +func init() { + matchers[".for"] = matchers[".f"] + matchers[".lsp"] = matchers[".lisp"] } var ( + blitzBasicMatcher = substring.BytesOr( + substring.BytesHas(`End Function`), + substring.BytesRegexp(`\\s*;`), + ) cPlusPlusMatcher = substring.BytesOr( substring.BytesRegexp(`\s*template\s*<`), substring.BytesRegexp(`\s*#\s*include <(cstdint|string|vector|map|list|array|bitset|queue|stack|forward_list|unordered_map|unordered_set|(i|o|io)stream)>`), substring.BytesRegexp(`\n[ \t]*try`), substring.BytesRegexp(`\n[ \t]*(class|(using[ \t]+)?namespace)\s+\w+`), - substring.BytesRegexp(`\n[ \t]*(private|public|protected):$`), + substring.BytesRegexp(`\n[ \t]*(private|public|protected):\n`), substring.BytesRegexp(`std::\w+`), substring.BytesRegexp(`[ \t]*catch\s*`), ) -) - -func bExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`(include|modules)`).Match(input) { - return "Limbo", true - } - - return "Brainfuck", false -} - -func bfExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`(fprintf|function|return)`).Match(input) { - return "HyPhy", true - } - - return "Brainfuck", false -} - -func incExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`^<\?(?:php)?`).Match(input) { - return "PHP", true - } - - return OtherLanguage, true -} - -func fsExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`\n(: |new-device)`).Match(input) { - return "Forth", true - } else if substring.BytesRegexp(`\s*(#light|import|let|module|namespace|open|type)`).Match(input) { - return "F#", true - } else if substring.BytesRegexp(`(#version|precision|uniform|varying|vec[234])`).Match(input) { - return "GLSL", true - } else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input) { - return "Filterscript", true - } - - return OtherLanguage, true -} - -func hExtLanguage(input []byte) (string, bool) { - if objectiveCMatcher.Match(input) { - return "Objective-C", true - } else if cPlusPlusMatcher.Match(input) { - return "C++", true - } - - return "C", true -} - -func hhExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`^<\?(?:hh)?`).Match(input) { - return "Hack", true - } else if cPlusPlusMatcher.Match(input) { - return "C++", true - } - - return OtherLanguage, false -} - -func msExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`[.'][a-z][a-z](\s|$)`).Match(input) { - return "Groff", true - } - - return "MAXScript", true -} - -func nExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`^[.']`).Match(input) { - return "Groff", true - } else if substring.BytesRegexp(`(module|namespace|using)`).Match(input) { - return "Nemerle", true - } - - return OtherLanguage, false -} - -func lExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`\(def(un|macro)\s`).Match(input) { - return "Common Lisp", true - } else if substring.BytesRegexp(`(%[%{}]xs|<.*>)`).Match(input) { - return "Lex", true - } else if substring.BytesRegexp(`\.[a-z][a-z](\s|$)`).Match(input) { - return "Groff", true - } else if substring.BytesRegexp(`(de|class|rel|code|data|must)`).Match(input) { - return "PicoLisp", true - } - - return OtherLanguage, false -} - -func lispExtLanguage(input []byte) (string, bool) { - if commonLispMatcher.Match(input) { - return "Common Lisp", true - } else if substring.BytesRegexp(`\s*\(define `).Match(input) { - return "NewLisp", true - } - - return OtherLanguage, false -} - -var ( commonLispMatcher = substring.BytesRegexp("(?i)(defpackage|defun|in-package)") coolMatcher = substring.BytesRegexp("(?i)class") openCLMatcher = substring.BytesOr( @@ -166,21 +444,6 @@ var ( substring.BytesHas(`/*`), substring.BytesHas(`//`), ) -) - -func clExtLanguage(input []byte) (string, bool) { - if commonLispMatcher.Match(input) { - return "Common Lisp", true - } else if coolMatcher.Match(input) { - return "Cool", true - } else if openCLMatcher.Match(input) { - return "OpenCL", true - } - - return OtherLanguage, false -} - -var ( apexMatcher = substring.BytesOr( substring.BytesHas("{\n"), substring.BytesHas("}\n"), @@ -194,139 +457,15 @@ var ( substring.BytesHas("'*"), substring.BytesRegexp(`(?i)(attribute|option|sub|private|protected|public|friend)\b`), ) -) - -func clsExtLanguage(input []byte) (string, bool) { - if texMatcher.Match(input) { - return "TeX", true - } else if visualBasicMatcher.Match(input) { - return "Visual Basic", true - } else if openEdgeABLMatcher.Match(input) { - return "OpenEdge ABL", true - } else if apexMatcher.Match(input) { - return "Apex", true - } - - return OtherLanguage, false -} - -var ( - mathematicaMatcher = substring.BytesHas(`\n\s*\(\*`) + mathematicaMatcher = substring.BytesHas(`^\s*\(\*`) matlabMatcher = substring.BytesRegexp(`\b(function\s*[\[a-zA-Z]+|pcolor|classdef|figure|end|elseif)\b`) objectiveCMatcher = substring.BytesRegexp( `@(interface|class|protocol|property|end|synchronised|selector|implementation)\b|#import\s+.+\.h[">]`) -) - -func mExtLanguage(input []byte) (string, bool) { - if objectiveCMatcher.Match(input) { - return "Objective-C", true - } else if substring.BytesHas(`:- module`).Match(input) { - return "Mercury", true - } else if substring.BytesRegexp(`\n: `).Match(input) { - return "MUF", true - } else if substring.BytesRegexp(`^\s*;`).Match(input) { - return "M", true - } else if mathematicaMatcher.Match(input) { - return "Mathematica", true - } else if matlabMatcher.Match(input) { - return "Matlab", true - } else if substring.BytesRegexp(`^\w+\s*:\s*module\s*{`).Match(input) { - return "Matlab", true - } - - return OtherLanguage, false -} - -func mdExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`\n[-a-z0-9=#!\*\[|]`).Match(input) { - return "Markdown", true - } else if substring.BytesRegexp(`\n(;;|\(define_)`).Match(input) { - return "GCC Machine Description", true - } - return OtherLanguage, false -} - -var ( prologMatcher = substring.BytesRegexp(`^[^#]+:-`) perlMatcher = substring.BytesRegexp(`use strict|use\s+v?5\.`) perl6Matcher = substring.BytesRegexp(`(use v6|(my )?class|module)`) -) - -func plExtLanguage(input []byte) (string, bool) { - if prologMatcher.Match(input) { - return "Prolog", true - } else if perl6Matcher.Match(input) { - return "Perl6", true - } - - return "Perl", false -} - -func pmExtLanguage(input []byte) (string, bool) { - if perlMatcher.Match(input) { - return "Perl", true - } else if perl6Matcher.Match(input) { - return "Perl6", true - } - - return "Perl", false -} - -func tExtLanguage(input []byte) (string, bool) { - if perlMatcher.Match(input) { - return "Perl", true - } else if perl6Matcher.Match(input) { - return "Perl6", true - } else if substring.BytesRegexp(`^\s*%|^\s*var\s+\w+\s*:\s*\w+`).Match(input) { - return "RenderScript", true - } else if substring.BytesRegexp(`^\s*use\s+v6\s*;`).Match(input) { - return "RenderScript", true - } - - return "Perl", false -} - -func rsExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp(`(use |fn |mod |pub |macro_rules|impl|#!?\[)`).Match(input) { - return "Rust", true - } else if substring.BytesRegexp(`#include|#pragma\s+(rs|version)|__attribute__`).Match(input) { - return "RenderScript", true - } - - return OtherLanguage, false -} - -func proExtLanguage(input []byte) (string, bool) { - if prologMatcher.Match(input) { - return "Prolog", true - } - - return OtherLanguage, false -} - -func tocExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp("## |@no-lib-strip@").Match(input) { - return "World of Warcraft Addon Data", true - } else if substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(input) { - return "TeX", true - } - - return OtherLanguage, false -} - -func slsExtLanguage(input []byte) (string, bool) { - if substring.BytesRegexp("## |@no-lib-strip@").Match(input) { - return "World of Warcraft Addon Data", true - } else if substring.BytesRegexp("(contentsline|defcounter|beamer|boolfalse)").Match(input) { - return "TeX", true - } - - return OtherLanguage, false -} - -var ( - pgSQLMatcher = substring.BytesOr( + pgSQLMatcher = substring.BytesOr( substring.BytesRegexp(`(?i)\\i\b|AS \$\$|LANGUAGE '?plpgsql'?`), substring.BytesRegexp(`(?i)SECURITY (DEFINER|INVOKER)`), substring.BytesRegexp(`BEGIN( WORK| TRANSACTION)?;`), @@ -340,31 +479,3 @@ var ( substring.BytesRegexp(`(?i)constructor\W+function`), ) ) - -func sqlExtLanguage(input []byte) (string, bool) { - if pgSQLMatcher.Match(input) { - return "PLpgSQL", true - } else if db2SQLMatcher.Match(input) { - return "SQLPL", true - } else if oracleSQLMatcher.Match(input) { - return "PLSQL", true - } - - return "SQL", false -} - -func tsExtLanguage(input []byte) (string, bool) { - if substring.BytesHas("").Match(input) { - return "XML", true - } - - return "TypeScript", true -} - -func tsxExtLanguage(input []byte) (string, bool) { - if substring.BytesHas("").Match(input) { - return "XML", true - } - - return "TypeScript", true -}