diff --git a/src/lice_comb/impl/id_detection.clj b/src/lice_comb/impl/id_detection.clj index 60243cd..e888aee 100644 --- a/src/lice_comb/impl/id_detection.clj +++ b/src/lice_comb/impl/id_detection.clj @@ -181,7 +181,7 @@ (contains? m "lgpl") "LGPL" (contains? m "gpl") "GPL") version-present? (boolean (get-rencgs m ["version"] false)) - version (get-rencgs m ["version"] (if (= variant "LGPL") "2.0" "1.0")) + version (get-rencgs m ["version"] (if (= variant "LGPL") "2.0" "1.0")) ; Note: on the advice of the SPDX technical team, default to earliest version when version not present version (s/replace version #"\p{Punct}+" ".") [confidence confidence-explanations] (if version-present? @@ -195,7 +195,7 @@ [suffix confidence-explanations] (cond (contains? m "orLater") ["or-later" confidence-explanations] (contains? m "only") ["only" confidence-explanations] - :else [(if version-present? "only" "or-later") ; Note: on the advice of SPDX technical team, default to "or later" variant if version not present + :else [(if version-present? "only" "or-later") ; Note: on the advice of SPDX technical team, default to "or later" variant if version suffix not present (set/union #{:missing-version-suffix} confidence-explanations)]) id (str variant "-" version "-" suffix)] [(assert-listed-id id) confidence confidence-explanations])) @@ -307,8 +307,12 @@ :regex #"(?i)(?\d+(\.\d+)?)?)?(?!\w)" :fn (constantly ["UPL-1.0" :high])} ; There are no other listed versions of this license :WTFPL { - :regex #"(?i)(?> matches (med/distinct-by :id) ;####TODO: THINK ABOUT MERGING INSTEAD OF DROPPING (e.g. if the same id is detected in two different places in s, and we want to preserve the two eis) (sort-by :start) - (map #(hash-map (:id %) (merge {:id (:id %) ; We duplicate this here in case the result gets merged into an expression - :type (:type %) - :confidence (:confidence %) - :strategy (:strategy %) - :source (:source %)} - (when (seq (:confidence-explanations %)) - {:confidence-explanations (:confidence-explanations %)}))))))) + (map #(merge {:id (:id %) ; We duplicate this here in case the result gets merged into an expression + :type (:type %) + :confidence (:confidence %) + :strategy (:strategy %) + :source (:source %)} + (when (seq (:confidence-explanations %)) + {:confidence-explanations (:confidence-explanations %)})))))) (defn init! "Initialises this namespace upon first call (and does nothing on subsequent diff --git a/src/lice_comb/impl/parsing.clj b/src/lice_comb/impl/parsing.clj index e19e8a8..1870231 100644 --- a/src/lice_comb/impl/parsing.clj +++ b/src/lice_comb/impl/parsing.clj @@ -20,6 +20,7 @@ [spdx.expressions :as sexp] [spdx.regexes :as sre] [embroidery.api :as e] + [rencg.api :as rencg] [lice-comb.impl.spdx :as lcis] [lice-comb.impl.id-detection :as lciid] [lice-comb.impl.expressions-info :as lciei] @@ -86,6 +87,7 @@ ; We don't need to sexp/normalise the keys here, as we never detect an expression from a URI (lciei/prepend-source uri (lcic/correct result))))) +;####TODO: REMOVE ME!!!! (defn debug-print ([x] (debug-print x nil)) ([x msg] @@ -93,31 +95,6 @@ (flush) x)) -(defn- not-blank-string? - "`true` when `x` is not a blank `String`." - [x] - (or (not (string? x)) - (not (s/blank? x)))) - -(defn- mapcat-pred - "mapcat on `coll`, calling `f` for any/all values for which `pred` returns - logical true, passing through other values unchanged." - [pred f coll] - (when (and pred f coll) - (mapcat #(if (pred %) - (f %) - [%]) - coll))) - -(comment -(defn- determine-strategy-for-id-match - "Returns the strategy (a keyword) for the given `m`atch, matched to `id`." - [match id] - (cond - (= (s/lower-case match) (s/lower-case id)) :spdx-listed-identifier - :else :spdx-listed-identifier-near-match)) -) - (defn- determine-strategy "Returns the strategy (a keyword) for the given `match`, matched to `listed-name`." @@ -128,32 +105,19 @@ (= (s/lower-case match) (s/lower-case listed-name)) :spdx-listed-name-case-insensitive-match :else :spdx-listed-name-near-match)) -(comment -(defn- replace-operators-with-keywords - "Replaces `String`s that represent SPDX expression operators in `coll` with - an equivalent keyword (`:and`, `:or`, `:with`), or nothing if the 'operator' - in question is unidentifiable (e.g. `and/or`, `/`, `\\`). Other values that - are not operators are preserved in `coll` (but trimmed of whitespace)." - [coll] - (filter identity - (map #(if (string? %) - (let [trimmed (s/trim %) - val (-> trimmed - s/lower-case - (s/replace #"(?i)w/" "with") - (s/replace #"&+" "and") - (s/replace #"(?i)and\s*/+\\+\s*or" "/") - (s/replace #"/+" "/") - (s/replace #"\\+" "/"))] - (case val - "and" :and - "or" :or - "with" :with - ("/" "") nil - trimmed)) - %) ; Not an operator - keep it unchanged - coll))) -) +(defn- make-unidentified-ei + "Makes an expression-info map for the given license `n`ame, and (optionally) + unidentified license-ref." + ([n] (make-unidentified-ei n (lcis/name->unidentified-license-ref n))) + ([n unidentified-license-ref] + {:id unidentified-license-ref :type :concluded :confidence :high :strategy :unidentified :source (list n)})) + +(defn- make-unidentified-eis-map + "Makes a (singleton) expressions-info map for the given unidentified license + `n`ame." + [n] + (let [unidentified-id (lcis/name->unidentified-license-ref n)] + {unidentified-id (list (make-unidentified-ei n unidentified-id))})) (defn- collapse-duplicate-operator-keywords "Collapses sequential runs of keywords in `coll`, either to 1 keyword if @@ -169,33 +133,89 @@ [] coll))) +(defn- remove-invalid-operator-combos + "Strip invalid operator combinations from `coll`." + [coll] + (when coll + (->> coll + (drop-while keyword?) + (lci3/rdrop-while keyword?) + collapse-duplicate-operator-keywords + seq))) + (def ^:private operator-re #"(?i)((?and\s*/+\\+\s*or)(?!\w)|(?and)(?!\w)|(?or)(?!-later)(?!\w)|(?with)(?!\w)|(?&+)|(?/+)|(?\\+))") (defn- detect-operators "Detects operators in `String` values in `coll`, replacing them with keywords and normalising invalid combinations." [coll] - (->> (filter not-blank-string? - (mapcat-pred string? - #(lciu/replacing-split % operator-re (fn [m] - (cond - (get m "and") :and - (get m "ampersand") :and - (get m "or") :or - (get m "with") :with - :else nil))) - coll)) - (drop-while keyword?) - (lci3/rdrop-while keyword?) - collapse-duplicate-operator-keywords)) + (remove-invalid-operator-combos + (filter lciu/not-blank-string? + (lciu/mapcat-pred string? + #(lciu/replacing-split % operator-re (fn [m] + (cond + (get m "and") :and + (get m "ampersand") :and + (get m "or") :or + (get m "with") :with + :else nil))) + coll)))) + +(defn- find-ids-in-fragment + "Attempts to find one or more ids in `fragment` (a `String`). + For each fragment returns a sequence of expression-info maps, one for each + detected identifer. If no ids are found in a fragment, returns the fragment in + a sequence." + [fragment] + (let [fragment (s/trim fragment)] + ; 1. Is it a listed id, LicenseRef or AdditionRef? + (if-let [ids (seq (map #(get % "Identifier") (rencg/re-seq-ncg (sre/ids-re) fragment)))] + (map #(hash-map :id (sexp/normalise %) :type :declared :strategy :spdx-listed-identifier :source (list fragment)) ids) + ; 2. Does it contain any SPDX identifiers? + (if-let [ids (lcis/find-ids fragment)] + (map (fn [id] {:id id :TODO "####TODO!"}) ids) ;####TODO: IMPLEMENT THIS!!!! + ; 3. Can we detect other ids in it, using custom regexes? + (if-let [result (lciid/find-ids fragment)] + result + ; 4. Give up and return fragment (in a sequence) + [fragment]))))) + +(defn- find-ids + "Attempt to find ids in the `String`s in `coll`. Other values are passed + through unchanged." + [coll] + (lciu/mapcat-pred string? find-ids-in-fragment coll)) (defn- done-parsing? "Are we done parsing `coll`?" [coll] (every? (complement string?) coll)) -(defn- replace-names - "Detects listed license names in the `String`s in `coll`." +(def ^:private extraneous-fragment-res-d (delay [#"(?i)dual" + #"(?i)(public[\s-\\\/]+)?licen[cs]e" + #"(?i)copyright(\s+\(c\))?(\s+©️)?(\s+©)?" + #"(?i)[\s-,]*version[\s-,]+\d+"])) ; Some listed names leave dangling versions (e.g. "Do What The Fuck You Want To Public License, Version 2") + +(defn- remove-extraneous-fragments + "Removes 'extraneous' fragments (`String`s) from `coll`." + [coll] + (loop [[re & r] @extraneous-fragment-res-d + coll coll] + (if (or (not re) + (done-parsing? coll)) + (filter lciu/not-blank-string? coll) + (let [new-coll (filter lciu/not-blank-string? + (lciu/map-pred string? + #(when-not (or (< (count %) 4) ; Strip anything shorter than 4 characters long + (re-matches re (s/trim %))) ; or that matches one of the extraneous fragment regexes + %) + coll))] + (recur r new-coll))))) + +(defn- replace-spdx-names + "Detects SPDX listed license names in the `String`s in `coll`, returning a + sequence of expression-info maps, one for each detected license name. If no + names are found in a fragment, `nil` will be returned." [coll] (loop [[[re id n] & r] @lcis/name-regex-id-pairs-d ;####TODO: CONSIDER MOVING THAT VAR HERE!!!! coll coll] @@ -203,21 +223,48 @@ (not id) (done-parsing? coll)) ; coll is fully devoid of strings, so we can terminate early coll - (let [new-coll (filter not-blank-string? - (mapcat-pred string? - #(lciu/replacing-split % - re - (fn [m] - (let [strategy (determine-strategy (:match m) id n)] - (merge {:id id - :strategy strategy - :source (list (:match m))} - (case strategy - :spdx-listed-identifier {:type :declared} - {:type :concluded :confidence :high}))))) - coll))] + (let [new-coll (lciu/mapcat-pred string? + #(lciu/replacing-split % + re + (fn [m] + (let [id (str id (when (get m "orLater") "+")) + id (if-let [new-id (sexp/normalise id)] new-id id) ; Note: exception ids won't normalise + strategy (determine-strategy (:match m) id n)] + (merge {:id id + :strategy strategy + :source (list (:match m))} + (case strategy + :spdx-listed-identifier {:type :declared} + {:type :concluded :confidence :high}))))) + coll)] (recur r new-coll))))) +(defn- replace-other-names + "Detects other license names in the `String`s in `coll`, returning a sequence + of expression-info maps, one for each detected license name. If no + names are found in a fragment, `nil` will be returned." + [coll] + (loop [[family & r] [:GNU :CDDL :X11] ;####TODO: consider other families! + coll coll] + (if (or (not family) + (done-parsing? coll)) ; coll is fully devoid of strings, so we can terminate early + coll + (let [new-coll (lciu/mapcat-pred string? + (partial lciid/replace-ids family) + coll)] + (recur r new-coll))))) + +(defn- replace-unidentifieds + "Replace any `String`s in `coll` with an expression-info map containing an + unidentified LicenseRef." + [coll] + (lciu/map-pred string? + #(let [s (lciu/trim-non-word %)] + (if (s/blank? s) + (make-unidentified-ei %) + (make-unidentified-ei s))) + coll)) + (defn- group-expressions "Groups expressions in `coll` into sequences of valid SPDX expressions (albeit in sequence form, rather than `String` form. @@ -245,44 +292,56 @@ and operator keywords. Returns an expressions-info map." [coll] (when (seq coll) - (let [eis (filter map? coll) - expr-elements (map #(if (keyword? %) % (:id %)) coll) -; ####TODO: WHEN THE KEYWORD IS :with ENSURE THE FOLLOWING ELEMENT IS AN EXCEPTION, OR (IF LICENSEREF), CONVERT IT TO AN ADDITIONREF - expressions (map #(sexp/normalise (s/join " " (map name %))) (group-expressions expr-elements)) - ; Now regroup expression-infos with their associated expression(s) - ei-lookup (group-by :id eis) - expr-ei-pairs (mapcat #(let [ids (sexp/extract-ids (sexp/parse %))] - [% (seq (filter identity (conj (vec (mapcat (fn [id] (get ei-lookup id)) ids)) - (when (> (count ids) 1) {:type :concluded :confidence :high :strategy :expression-inference}))))]) - expressions) - result (apply hash-map expr-ei-pairs)] - result))) - +;####TEST!!!! +;(debug-print coll "rebuild-expressions start") + (if (every? string? coll) + nil ; Didn't detect anything, so fall through and mark the entire thing as unidentified + (let [eis (filter map? coll)] + (if (every? lcis/unidentified? (map :id eis)) + nil ; Detected nothing but unidentifieds, so fall through and mark the entire thing as unidentified + (let [; ####TODO: WHEN THE KEYWORD IS :with ENSURE THE FOLLOWING ELEMENT IS AN EXCEPTION, OR (IF LICENSEREF), CONVERT IT TO AN ADDITIONREF + grouped-expressions (group-expressions (map #(lci3/when-pred % map? :id) coll)) + expressions (map #(sexp/normalise (s/join " " (map name %))) grouped-expressions) + ; Now regroup expression-infos with their associated expression(s) + ei-lookup (group-by :id eis) + expr-ei-pairs (mapcat #(let [ids (sexp/extract-ids (sexp/parse %) {:include-or-later? true})] + [% (seq (filter identity (conj (vec (mapcat (fn [id] (get ei-lookup id)) ids)) + (when (> (count ids) 1) {:type :concluded :confidence :high :strategy :expression-inference}))))]) + expressions) + result (apply hash-map expr-ei-pairs)] + result)))))) + +;####TODO: CAN PROBABLY MOVE THIS INTO parse-name ONCE ITS WORKING!!!! (defn- parse-XXXXTODO "Parses the given license `n`ame, returning an an expressions-info map or `nil` if no expressions can be found." [n] (when-let [result (-> [n] + ; Parsing, with short circuiting of steps if we're done (lciu/until-> done-parsing? + replace-other-names ; Replace specific name variations that are highly problematic first ;####TEST!!!! ;(debug-print "0") - replace-names ; Replace names first, as this covers the vast majority of "and", "or", "with" in names cases -;####TEST!!!! -;(debug-print "1") -; replace-trickynames ; Replace other name variations not covered by the standard name regexes -; replace-expressions ; This covers ids - (-> detect-operators ; Split the strings on operators, with confidence that they're truly operators and not part of a name + replace-spdx-names ; Replace SPDX listed names; this covers the vast majority of "and", "or", "with" in names cases ;####TEST!!!! ;(debug-print "2") -; find-ids) -) -; mark-unidentifieds -) +; replace-expressions ; This covers ids ; ####PERHAPS UNECESSARY, GIVEN find-ids DOES THIS ANYWAY??? +;####TEST!!!! +;(debug-print "3") + detect-operators ; Split the strings on operators, with confidence that they're truly operators and not part of a name +;####TEST!!!! +;(debug-print "4") + find-ids) + ; Cleanup + remove-extraneous-fragments + remove-invalid-operator-combos + replace-unidentifieds ;####TEST!!!! -(debug-print "after parse") +;(debug-print "after parse, before rebuild") + ; Rebuild the final expression(s) rebuild-expressions)] ;####TEST!!!! -(debug-print result "after expression rebuild") +;(debug-print result "after rebuild") (lciei/prepend-source n result))) (defn parse-name @@ -293,7 +352,7 @@ (let [n (s/trim n)] ; 1. If it's cursed, return it (if-let [cursed-ids (get @cursed-names-d n)] - (map #(apply hash-map %) cursed-ids) + cursed-ids ; 2. If it's a valid SPDX expression, return the normalised rendition of it (if-let [parse-tree (sexp/parse n)] (let [normalised-expression (sexp/unparse parse-tree)] @@ -307,12 +366,11 @@ (if-let [ids (parse-uri n)] ids ; It was a URL, but we weren't able to resolve it to any ids, so return it as unidentified - {(lcis/name->unidentified-license-ref n) (list {:type :concluded :confidence :low :strategy :unidentified :source (list n)})}) + (make-unidentified-eis-map n)) ; 4. Parse the name (if-let [result (parse-XXXXTODO n)] result - (let [unidentified-id (lcis/name->unidentified-license-ref n)] - {unidentified-id (list {:id unidentified-id :type :concluded :confidence :low :strategy :unidentified :source (list n)})})))))))) + (make-unidentified-eis-map n)))))))) (defn init! "Initialises this namespace upon first call (and does nothing on subsequent @@ -326,267 +384,5 @@ (lciid/init!) (lcihttp/init!) @cursed-names-d + @extraneous-fragment-res-d nil) - - - - - - -(comment - - -;####TODO: CONSIDER MOVING THIS TO lice-comb.impl.id-detection -(defn- attempt-to-match-entire-name - "Attempts to match a single SPDX expression from `s` (a `String`). The - specific steps involve determining whether `s` is: - 1. a 'cursed' name (see resources/lice_comb/names.edn) - 2. an SPDX expression - 3. an SPDX listed identifier (near match) - 4. an SPDX listed name (near match) - 5. an SPDX listed URL (near match) - 6. proprietary/commercial - 7. public domain - - Returns an expressions-info map or `nil` if `s` is not recognised as the name - of a single license." - [s] - (let [s (s/trim s)] - ; 1. Is it cursed? - (if-let [cursed-ids (get @cursed-names-d s)] - (map #(apply hash-map %) cursed-ids) - (if-let [normalised-expression (sexp/normalise s)] - ; 2. s is already an SPDX id / expression - {normalised-expression (list {:type :declared - :strategy (if (= 1 (count (sexp/extract-ids (sexp/parse normalised-expression)))) :spdx-listed-identifier :spdx-expression) - :source (list s)})} - (if-let [id (lcis/best-near-match-id s)] - ; 3. ns is a near match for an SPDX identifier - {id (list {:id id :type :concluded :confidence :high :strategy (determine-strategy-for-id-match s id) :source (list s)})} - (if-let [id (lcis/best-near-match-name s)] - ; 4. ns is an SPDX listed name - {id (list {:id id :type :concluded :confidence :high :strategy (determine-strategy-for-name-match s id) :source (list s)})} - (if (lciu/valid-http-uri? s) - (if-let [ei-map (parse-uri s)] - ; 5.1. s is a URL and it's in the SPDX license or exception list - ei-map - ; 5.2. s is a URL but it's not in the SPDX license or exception list - (let [unidentified-license-ref (lcis/name->unidentified-license-ref s)] - {unidentified-license-ref (list {:id unidentified-license-ref :type :concluded :confidence :high :strategy :unidentified :source (list s)})})) - (if (lciid/match-id :proprietary-commercial s) - ; 6. s is proprietary / commercial - (let [prop-comm-license-ref (lcis/proprietary-commercial)] - {prop-comm-license-ref (list {:id prop-comm-license-ref :type :concluded :confidence :high :strategy :regex-matching :source (list s)})}) - (when (lciid/match-id :public-domain s) - ; 7. s is Public Domain - (let [public-domain-license-ref (lcis/public-domain)] - {public-domain-license-ref (list {:id public-domain-license-ref :type :concluded :confidence :high :strategy :regex-matching :source (list s)})})))))))))) - -(defn- replace-listed-ids-near-match - "Replaces all near matches of SPDX listed identifiers in `s` with their SPDX - ids, returning a tuple where the first element is the new `String`, and the - second element is a sequence of expression info maps or `nil` if no replacements - occurred." - [s] - (if (s/blank? s) - [s nil] - (let [[new-s explanations] (lcis/replace-near-match-ids-with-id (s/trim s)) - ei (when explanations - (map #(let [[found id] %] - {:id id :type :concluded :confidence :high :strategy (determine-strategy-for-id-match found id) :source (list found)}) - explanations))] - [new-s ei]))) - -(defn- replace-listed-names-near-match - "Replaces all near matches of SPDX listed names in `s` with their SPDX ids, - returning a tuple where the first element is the new `String`, and the second - element is a sequence of expression info maps or `nil` if no replacements - occurred." - [s] - (if (s/blank? s) - [s nil] - (let [[new-s explanations] (lcis/replace-near-match-names-with-id (s/trim s)) - ei (when explanations - (map #(let [[found id] %] - {:id id :type :concluded :confidence :high :strategy (determine-strategy-for-name-match found id) :source (list found)}) - explanations))] - [new-s ei]))) - -(defn- replace-tricky-names - "Replaces all tricky names in `s` with their SPDX ids, returning a tuple where - the first element is the (potentially new) `String`, and the second element is - a sequence of expression info maps or `nil` if no replacements occurred." - [s] - (if (s/blank? s) - [s nil] - (let [[new-s gnu-eis] (lciid/replace-ids :GNU s) - [new-s cddl-eis] (lciid/replace-ids :CDDL new-s) - [new-s pc-eis] (lciid/replace-ids :proprietary-commercial new-s)] - ;####TODO: Check all of the families for trickiness (operators in names, negative look-behinds/aheads, etc.) - e.g. X11 - [new-s (seq (concat gnu-eis cddl-eis pc-eis))]))) - - - -(defn- attempt-to-find-ids-in-fragment - "Attempts to find one or more ids in `fragment` (a `String`). - For each fragment returns a sequence of maps, where the key(s) are the - detected identifier(s), and the value(s) are an expression-info map for that - identifer. If no ids are found in a fragment, the identifier for that fragment - will be an Unidentified LicenseRef. - - Other elements (i.e. operator keywords) are passed through unchanged)." - [fragment] - ; 1. Is it a listed id, LicenseRef or AdditionRef? - (if (re-matches (sre/ids-re) fragment) - [{fragment nil}] ; Don't need an expression-info here, since it will already have one from earlier steps in the parsing process - ; 2. Does it contain any SPDX identifiers? - (if-let [result (seq (map (fn [id] {id nil}) (lcis/find-ids fragment)))] ;####TODO: add eis, as in most _BUT NOT ALL_, cases the ids will already have an expression-info from earlier steps in the parsing process - result - ; 3. Can we detect other ids in it, using custom regexes? - (if-let [result (lciid/find-ids fragment)] - result - ; 4. Give up and use the unidentified LicenseRef - (let [unidentified-license-ref (lcis/name->unidentified-license-ref fragment)] - [{unidentified-license-ref {:id unidentified-license-ref :type :concluded :confidence :low :strategy :unidentified :source (list fragment)}}]))))) - -(defn- attempt-to-find-ids-in-fragments - "Attempts to find one or more ids in the fragments (`String`s) in `coll`. - For each fragment returns a sequence of maps, where the key(s) are the - detected identifier(s), and the value(s) are an expression-info map for that - identifer. If no ids are found in a fragment, the identifier for that fragment - will be an Unidentified LicenseRef. - - Other elements (i.e. operator keywords) are passed through unchanged)." - [coll] - ; This seemingly-redundant let block is only here to facilitate debugging - (let [result (flatten (map #(if (string? %) (attempt-to-find-ids-in-fragment %) %) coll))] -;####TEST!!!! -;(println "⭐️⭐️⭐️ attempt-to-find-ids-in-fragments result:" (pr-str result)) - result)) - -(defn- group-expressions - "Groups expressions in `coll` into sequences of valid SPDX expressions (albeit - in sequence form, rather than `String` form. - - For example: - [\"Apache-2.0\" \"MIT\"] -> [[\"Apache-2.0\"] [\"MIT\"]] - [\"Apache-2.0\" :or \"MIT\"] -> [[\"Apache-2.0\" :or \"MIT\"]] - [\"Apache-2.0\" :and \"MIT\" \"GPL-2.0-or-later\"] -> [[\"Apache-2.0\" :and \"MIT\"] [\"GPL-2.0-or-later\"]]" - [coll] - (loop [result [[]] - [f & r] coll] - (if-not f - ; Base case - result - ; Recursive case - (let [l (last result)] - (case [(string? (last l)) (string? f)] - [true true] (recur (conj result [f]) r) ; String/string, so start a new nested sequence in result - ([true false] [false true]) (recur (conj (vec (drop-last result)) (conj l f)) r) ; String/keyword or keyword/string, so continue the current last collection in result -; [false false] ; Not possible - we've already removed leading and consecutive keywords in fragments (in remove-invalid-operator-keywords) - ))))) - -(defn- rebuild-expressions - "Rebuilds one or more SPDX expressions from the given `expr` and - expression-infos (`eis`). `expr` is a heterogeneous sequence containing - maps and/or keywords. Each map represents a detected license, with an SPDX - identifier as the key and an exression-info map as the value. Each keyword - represents one of the SPDX expression operators (`:and`, `:or`, `:with`). - - Returns an expressions-info map." - [expr existing-eis] - (let [eis (concat existing-eis (filter identity (mapcat #(when (map? %) (vals %)) expr))) - expr-elements (mapcat #(if (keyword? %) [%] (keys %)) expr) - ; ####TODO: WHEN THE KEYWORD IS :with ENSURE THE FOLLOWING ELEMENT IS AN EXCEPTION, OR (IF LICENSEREF), CONVERT IT TO AN ADDITIONREF - expressions (map #(sexp/normalise (s/join " " (map name %))) (group-expressions expr-elements)) - ; Now regroup expression-infos with their associated expression(s) - ei-lookup (group-by :id eis) - expr-ei-pairs (mapcat #(let [ids (sexp/extract-ids (sexp/parse %))] - [% (seq (filter identity (conj (vec (mapcat (fn [id] (get ei-lookup id)) ids)) - (when (> (count ids) 1) {:type :concluded :confidence :high :strategy :expression-inference}))))]) - expressions) - result (apply hash-map expr-ei-pairs)] - result)) - - - -(defn- split-and-detect-fragments - "Splits `s` (a `String`) into fragments based on probable separators (SPDX - expression operators and various other delimiters commonly seen in license - names), then detects the license and/or exception identifier(s) in each - fragment, the finally rebuilds expression(s). Returns an expressions-info map - or `nil` if `s` is blank." - [s eis] -;####TEST!!!! -;(println "⭐️⭐️⭐️ attempt-to-parse-name - input:" (pr-str [s eis])) - (when-not (s/blank? s) - (let [expr (some-> (lciu/retained-split s #"(?i)((?> expr -; (filter #(or (keyword? %) (not (lcis/unidentified? (first (keys %)))))) - ; Get rid of any dangling keywords after filtering unidentifieds - (drop-while keyword?) - (lci3/rdrop-while keyword?)) - result (rebuild-expressions new-expr eis)] -;####TEST!!!! -;(println "⭐️⭐️⭐️ split-and-detect-fragments - result (case 2):" (pr-str result)) - result))))) - - -(defn- attempt-to-parse-name - "Attempts to parse `n`ame into one or more SPDX expressions, by: - 1. Replacing listed names with their ids - 2. Replacing listed names with their ids - 3. Replacing 'tricky' names with their ids - 4. Parsing the input for any elements it contains that haven't yet been - converted into an id - - Returns an expressions-info map or `nil` if parsing fails to find any SPDX - expressions." - [n] -;####TEST!!!! -(println "⭐️⭐️⭐️ attempt-to-parse-name - input:" (pr-str n)) - ; 1. Replace near matches for SPDX listed ids - (let [[new-n eis] (replace-listed-ids-near-match n)] -;####TEST!!!! -(println "⭐️⭐️⭐️ attempt-to-parse-name - result of step 1 (replace ids):" (pr-str [new-n eis])) - (if-let [normalised-expression (sexp/normalise new-n)] - (lciei/prepend-source n {normalised-expression eis}) - ; 2. Replace near matches for SPDX listed names - (let [[new-n new-eis] (replace-listed-names-near-match new-n) - eis (concat eis new-eis)] -;####TEST!!!! -(println "⭐️⭐️⭐️ attempt-to-parse-name - result of step 2 (replace names):" (pr-str [new-n eis])) - (if-let [normalised-expression (sexp/normalise new-n)] - (lciei/prepend-source n {normalised-expression eis}) - ; 3. Replace tricky names (those with operators in them, primarily) - (let [[new-n new-eis] (replace-tricky-names new-n ) - eis (concat eis new-eis)] -;####TEST!!!! -(println "⭐️⭐️⭐️ attempt-to-parse-name - result of step 3 (replace tricky names):" (pr-str [new-n eis])) - (if-let [normalised-expression (sexp/normalise new-n)] - (lciei/prepend-source n {normalised-expression eis}) - ; 4. Split on operators then detect fragments - note: this is the (only) point where we can end up with multiple expressions - (when-let [fully-parsed-result (split-and-detect-fragments new-n eis)] -;####TEST!!!! -(println "⭐️⭐️⭐️ attempt-to-parse-name - result of step 4 (parse):" (pr-str fully-parsed-result)) - (lciei/prepend-source n fully-parsed-result))))))))) - - - -) \ No newline at end of file diff --git a/src/lice_comb/impl/spdx.clj b/src/lice_comb/impl/spdx.clj index 2823dd8..6d831f3 100644 --- a/src/lice_comb/impl/spdx.clj +++ b/src/lice_comb/impl/spdx.clj @@ -154,6 +154,8 @@ [s] (best-identifier (near-match-id s))) +;####TODO: REMOVE IF UNNEEDED!!!! +(comment (defn replace-near-match-ids-with-id "Replaces all near matched ids in `s` (a `String`) with their actual (best) SPDX id. Result is a tuple containing the modified `s` and a sequence of @@ -170,6 +172,7 @@ replacement (seq (filter #(not= (first %) (second %)) replacement)) ; Remove redundant replacements such as ["GPL-2.0-only" "GPL-2.0-only"] new-replacements (if replacement (apply conj replacements replacement) replacements)] (recur new-s new-replacements r)))))) +) (defn- name-version-replacement "Returns a regex fragment for a version number (e.g. 2.0.0) in a name." @@ -230,7 +233,7 @@ (s/replace "\\b\\s+" "\\s+") (s/replace #"(.*)\\b\z" "$1") ; End clauses - (str "(?!\\w)") + (str "(?\\s*\\+)?(?!\\w)") ; And finally compile the regex re-pattern))) @@ -240,8 +243,10 @@ ; * we use all ids (including deprecated ones) because the real world may include anything ; * we preserve the listed name of the license in the tuple so that we can determine the precise matching strategy (def name-regex-id-pairs-d (delay (concat (sort (by #(count (str (first %))) descending) (map #(vec [(name->regex (:name %)) (sexp/normalise (:id %)) (:name %)]) @full-license-list-d)) - (sort (by #(count (str (first %))) descending) (map #(vec [(name->regex (:name %)) % (:name %)]) @full-exception-list-d))))) ; Note: can't normalise a solitary exception id since they're not a valid expression alone + (sort (by #(count (str (first %))) descending) (map #(vec [(name->regex (:name %)) (:id %) (:name %)]) @full-exception-list-d))))) ; Note: can't normalise a solitary exception id since they're not a valid expression alone +;####TODO: REMOVE IF UNNEEDED!!!! +(comment (defn near-match-name "Returns the id(s) (a set) when `n`ame 'near matches' one or more license or exception names, or `nil` if `n` is blank or no near matches were found. The @@ -274,6 +279,7 @@ [new-s replacement] (lciu/explaining-replace s re id) new-replacements (if replacement (apply conj replacements replacement) replacements)] (recur new-s new-replacements r)))))) +) (defn- urls-to-id-tuples "Extracts all urls for a given list (license or exception) entry." @@ -438,7 +444,9 @@ (defn find-ids "Returns a sequence of the distinct listed SPDX license ids, exceptions ids, LicenseRefs and AdditionRefs found in `s` (a `String`), in the order they were - found, or `nil` if no listed ids were found or `s` was `nil`." + found, or `nil` if no listed ids were found or `s` was `nil`. + + Note: results are NOT normalised." [s] (when s (when-let [matches (map #(get % "Identifier") (rencg/re-seq-ncg (sre/ids-re) s))] diff --git a/src/lice_comb/impl/utils.clj b/src/lice_comb/impl/utils.clj index 6fbfa64..9f08c84 100644 --- a/src/lice_comb/impl/utils.clj +++ b/src/lice_comb/impl/utils.clj @@ -63,8 +63,8 @@ [k (f v)])))) (defn map-pad - "Like map, but when presented with multiple collections of different lengths, - 'pads out' the missing elements with nil rather than terminating early." + "`map`, but when presented with multiple collections of different lengths, + 'pads out' the missing elements with `nil` rather than terminating early." [f & cs] (loop [result nil firsts (map first cs) @@ -75,11 +75,49 @@ (map first rests) (map rest rests))))) +(defn map-pred + "`map` on `coll`, calling `f` for any/all values for which `pred` returns + logical true, passing through other values unchanged." + [pred f coll] + (when (and pred f coll) + (map #(if (pred %) + (f %) + %) + coll))) + +(defn mapcat-pred + "`mapcat` on `coll`, calling `f` for any/all values for which `pred` returns + logical true, passing through other values unchanged." + [pred f coll] + (when (and pred f coll) + (mapcat #(if (pred %) + (f %) + [%]) + coll))) + +(defn not-blank-string? + "`true` when `x` is not a blank `String`." + [x] + (boolean + (when x + (or (not (string? x)) + (not (s/blank? x)))))) + (defn strim - "nil safe version of clojure.string/trim" + "`nil` safe version of clojure.string/trim" [^String s] (when s (s/trim s))) +(defn trim-non-word + "Like clojure.string/trim, but trims `s` of all leading and trailing + characters that aren't alphanumeric (`[\\w]` regex class, with Unicode + support enabled)." + [s] + (when s + (-> s + (s/replace #"(?U)\A[^\w]+" "") + (s/replace #"(?U)[^\w]+\z" "")))) + (defn is-digits? "Does the given string contains digits only?" [^String s] @@ -123,25 +161,8 @@ [& res] (re-pattern (s/join res))) -(defn retained-split - "As for `clojure.string/split`, but retains whatever `re` matched as distinct - elements in the result." - [^CharSequence s ^java.util.regex.Pattern re] - (let [m (re-matcher re s)] - (loop [result [] - index 0 - f (.find m)] - (if f - (let [match-start (.start m) - match-end (.end m) - match (subs s match-start match-end)] - (if (= index match-start) - (recur (conj result match) match-end (.find m)) ; Back-to-back matches - (recur (vec (concat result [(subs s index match-start) match])) match-end (.find m)))) - (conj result (subs s index (count s))))))) - (defn replacing-split - "As for [[retained-split]], but replaces whatever `re` matched with + "As for `clojure.string/split`, but replaces whatever `re` matched with `replacement`, which can be a value or a function of one argument. Notes: @@ -168,6 +189,14 @@ (recur (vec (concat result [(subs s index match-start) rep])) match-end (.find m)))) (conj result (subs s index (count s))))))) +(defn retained-split + "As for `clojure.string/split`, but retains whatever `re` matched as distinct + elements in the result." + [^CharSequence s ^java.util.regex.Pattern re] + (replacing-split s re #(get % :match))) ; Can't use :match literally here, since `(fn? :keyword)` is always false + +;####TODO: REMOVE THIS IF UNNEEDED!!!! +(comment (defn explaining-replace "Similar to `clojure.string/replace`, but returns a tuple where the first element is the new `String`, and the second element is a sequence of tuples, @@ -198,6 +227,7 @@ (.appendTail m buffer) [(.toString buffer) replacements])))) [s nil]))) +) (defn digit-name-to-number "Converts the English name of a single digit (a `String`) to that number (as a @@ -458,6 +488,7 @@ (defn file-handle-bounded-pmap "bounded-pmap* hardcoded to no more than 8192 virtual threads. This size is determined conservatively from macOS, since it's the least common denominator - of the major OSes in terms of number of possible open file handles." + of the major OSes in terms of number of possible open file handles per + process." [f coll] (e/bounded-pmap* 8192 f coll)) diff --git a/src/lice_comb/matching.clj b/src/lice_comb/matching.clj index 97c540e..1193da5 100644 --- a/src/lice_comb/matching.clj +++ b/src/lice_comb/matching.clj @@ -24,12 +24,12 @@ information) An expressions-info map has this structure: - + * key: an SPDX expression (`String`) * value: a sequence of 'expression-info' maps An expression-info map has this structure: - + * `:id` (`String`, optional): The portion of the SPDX expression that this info map refers to (usually, though not always, a single SPDX identifier). diff --git a/test/lice_comb/impl/id_detection_test.clj b/test/lice_comb/impl/id_detection_test.clj index 2abce7f..4ce8171 100644 --- a/test/lice_comb/impl/id_detection_test.clj +++ b/test/lice_comb/impl/id_detection_test.clj @@ -240,5 +240,5 @@ (is (every? not-nil? (map (partial test-regex gnu-re) gnu-licenses))))) (deftest find-ids-tests - (testing-with-data "GNU Family Regexes - correct identifier results" #(mapcat keys (find-ids %)) gnu-licenses-and-ids) - (testing-with-data "CC Family Regexes - correct identifier results" #(mapcat keys (find-ids %)) cc-by-licenses-and-ids)) + (testing-with-data "GNU Family Regexes - correct identifier results" #(map :id (find-ids %)) gnu-licenses-and-ids) + (testing-with-data "CC Family Regexes - correct identifier results" #(map :id (find-ids %)) cc-by-licenses-and-ids))