diff --git a/src/matchcode_toolkit/fingerprinting.py b/src/matchcode_toolkit/fingerprinting.py
index 88aafcb..e2b7ec0 100644
--- a/src/matchcode_toolkit/fingerprinting.py
+++ b/src/matchcode_toolkit/fingerprinting.py
@@ -13,6 +13,8 @@
 from licensedcode.tokenize import query_lines
 from samecode.halohash import BitAverageHaloHash
 
+from matchcode_toolkit.stemming import get_stem_code
+
 # A collection of directory fingerprints that we want to avoid
 IGNORED_DIRECTORY_FINGERPRINTS = [
     # This is both the directory content and directory structure fingerprint for
@@ -224,7 +226,43 @@ def get_file_fingerprint_hashes(
         content = f.read()
 
     return create_file_fingerprints(
-        content,
+        content=content,
+        ngram_length=ngram_length,
+        window_length=window_length,
+        include_ngrams=include_ngrams,
+    )
+
+
+def get_stemmed_file_fingerprint_hashes(
+    location,
+    ngram_length=5,
+    window_length=16,
+    include_ngrams=False,
+    **kwargs,
+):
+    """
+    Return a mapping of stemmed code fingerprint hashes for the file at `location`.
+
+    The `stemmed_halo1` hash is the hex digest of the fingerprint of the stemmed file content.
+    `stemmed_halo1` is empty if the file is empty.
+
+    - We start by breaking the file into words (tokens)
+    - We compute ngrams over the list of tokens
+
+    Return an empty mapping if `location` is not a text file.
+    """
+    from commoncode import filetype
+    from typecode.contenttype import get_type
+
+    # Do not process `location` if it's not a text file
+    ft = get_type(location)
+    if not (filetype.is_file(location) and ft.is_text):
+        return {}
+
+    stemmed_content = get_stem_code(location=location)
+
+    return create_file_fingerprints(
+        stemmed_content=stemmed_content,
         ngram_length=ngram_length,
         window_length=window_length,
         include_ngrams=include_ngrams,
@@ -232,20 +270,37 @@ def get_file_fingerprint_hashes(
 
 
 def create_file_fingerprints(
-    content, ngram_length=5, window_length=SNIPPET_WINDOW_LENGTH, include_ngrams=False
+    content=None,
+    stemmed_content=None,
+    ngram_length=5,
+    window_length=SNIPPET_WINDOW_LENGTH,
+    include_ngrams=False,
 ):
     """
-    Return a mapping of halo1 and snippet hashes from content string
+    Return a mapping of halo1 and snippet hashes from `content` or `stemmed_content`, not both.
     """
     from licensedcode.tokenize import ngrams
     from licensedcode.tokenize import select_ngrams
 
+    if content and stemmed_content:
+        raise Exception(
+            "create_file_fingerprints only accepts an input of `content` or `stemmed_content`, not both."
+        )
+
+    if stemmed_content:
+        halo1_key = "stemmed_halo1"
+        snippets_key = "stemmed_snippets"
+    else:
+        halo1_key = "halo1"
+        snippets_key = "snippets"
+
     fingerprints = {
-        "halo1": "",
-        "snippets": [],
+        halo1_key: "",
+        snippets_key: [],
     }
 
     # tokenize content into words
+    content = content or stemmed_content
     words = list(tokenizer(content))
 
     # Create a file fingerprint from the number of elements in the content hash
@@ -259,7 +314,7 @@ def create_file_fingerprints(
         content_fingerprint = content_hash.hexdigest().decode("utf-8")
         ngs_count_hex_str = "%08x" % ngs_count
         file_fingerprint = ngs_count_hex_str + content_fingerprint
-        fingerprints["halo1"] = file_fingerprint
+        fingerprints[halo1_key] = file_fingerprint
 
     # Select windows from the content to compute snippet fingerprints
     windows = ngrams(words, window_length)
@@ -279,7 +334,7 @@ def create_file_fingerprints(
             s["ngrams"] = list(window)
             snippets.append(s)
         if snippets:
-            fingerprints["snippets"] = snippets
+            fingerprints[snippets_key] = snippets
 
     return fingerprints
 
diff --git a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
index f4d20a1..c74bb99 100644
--- a/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
+++ b/src/matchcode_toolkit/pipelines/fingerprint_codebase.py
@@ -42,3 +42,4 @@ def fingerprint_codebase(self):
         """
         matchcode.fingerprint_codebase_directories(self.project)
         matchcode.fingerprint_codebase_resources(self.project)
+        matchcode.fingerprint_stemmed_codebase_resources(self.project)
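Usage sketch (illustration only, not part of the diff): assuming the functions behave as added above, with `get_stem_code` returning the stemmed text of the file at `location`, the plain and stemmed fingerprint entry points can be called side by side. The sample path below is hypothetical.

    from matchcode_toolkit.fingerprinting import get_file_fingerprint_hashes
    from matchcode_toolkit.fingerprinting import get_stemmed_file_fingerprint_hashes

    # Hypothetical input path, used only for illustration.
    location = "samples/example.c"

    # Plain fingerprints, keyed as "halo1" and "snippets".
    fingerprints = get_file_fingerprint_hashes(location)

    # Stemmed fingerprints, keyed as "stemmed_halo1" and "stemmed_snippets".
    stemmed_fingerprints = get_stemmed_file_fingerprint_hashes(location)

    print(fingerprints.get("halo1"))
    print(stemmed_fingerprints.get("stemmed_halo1"))

Because `create_file_fingerprints` now accepts either `content` or `stemmed_content` (but not both), the two entry points share the same hashing path and differ only in the input text and the key names they emit.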