Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 11 additions & 32 deletions libs/core/langchain_core/indexing/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
import hashlib
import json
import uuid
import warnings

Check failure on line 8 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.11) / Python 3.11

Ruff (F401)

langchain_core/indexing/api.py:8:8: F401 `warnings` imported but unused

Check failure on line 8 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.10) / Python 3.10

Ruff (F401)

langchain_core/indexing/api.py:8:8: F401 `warnings` imported but unused

Check failure on line 8 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.12) / Python 3.12

Ruff (F401)

langchain_core/indexing/api.py:8:8: F401 `warnings` imported but unused

Check failure on line 8 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.13) / Python 3.13

Ruff (F401)

langchain_core/indexing/api.py:8:8: F401 `warnings` imported but unused
from collections.abc import (
AsyncIterable,
AsyncIterator,
Expand Down Expand Up @@ -39,40 +39,22 @@


def _hash_string_to_uuid(input_string: str) -> str:
"""Hashes a string and returns the corresponding UUID."""
hash_value = hashlib.sha1(
input_string.encode("utf-8"), usedforsecurity=False
).hexdigest()
"""Hashes a string (using SHA-256) and returns the corresponding UUID."""
hash_value = hashlib.sha256(input_string.encode("utf-8")).hexdigest()
return str(uuid.uuid5(NAMESPACE_UUID, hash_value))


_WARNED_ABOUT_SHA1: bool = False


def _warn_about_sha1() -> None:
"""Emit a one-time warning about SHA-1 collision weaknesses."""
# Global variable OK in this case
global _WARNED_ABOUT_SHA1 # noqa: PLW0603
if not _WARNED_ABOUT_SHA1:
warnings.warn(
"Using SHA-1 for document hashing. SHA-1 is *not* "
"collision-resistant; a motivated attacker can construct distinct inputs "
"that map to the same fingerprint. If this matters in your "
"threat model, switch to a stronger algorithm such "
"as 'blake2b', 'sha256', or 'sha512' by specifying "
" `key_encoder` parameter in the `index` or `aindex` function. ",
category=UserWarning,
stacklevel=2,
)
_WARNED_ABOUT_SHA1 = True

# (SHA-1 warning functionality removed; SHA-256 is now default)

def _hash_string(
input_string: str, *, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
) -> uuid.UUID:
"""Hash *input_string* to a deterministic UUID using the configured algorithm."""
if algorithm == "sha1":
_warn_about_sha1()

Check failure on line 57 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.11) / Python 3.11

Ruff (F821)

langchain_core/indexing/api.py:57:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 57 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.10) / Python 3.10

Ruff (F821)

langchain_core/indexing/api.py:57:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 57 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.12) / Python 3.12

Ruff (F821)

langchain_core/indexing/api.py:57:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 57 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.13) / Python 3.13

Ruff (F821)

langchain_core/indexing/api.py:57:9: F821 Undefined name `_warn_about_sha1`
hash_value = _calculate_hash(input_string, algorithm)
return uuid.uuid5(NAMESPACE_UUID, hash_value)

Expand Down Expand Up @@ -146,13 +128,12 @@


def _calculate_hash(
text: str, algorithm: Literal["sha1", "sha256", "sha512", "blake2b"]
text: str, algorithm: Literal["sha256", "sha512", "blake2b", "sha1"] = "sha256"
) -> str:
"""Return a hexadecimal digest of *text* using *algorithm*."""
"""Return a hexadecimal digest of *text* using *algorithm*. Defaults to sha256."""
if algorithm == "sha1":
# Calculate the SHA-1 hash and return it as a UUID.
digest = hashlib.sha1(text.encode("utf-8"), usedforsecurity=False).hexdigest()
return str(uuid.uuid5(NAMESPACE_UUID, digest))
# SHA-1 is not recommended and should only be used for legacy compatibility
return hashlib.sha1(text.encode("utf-8")).hexdigest()

Check failure on line 136 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.11) / Python 3.11

Ruff (S324)

langchain_core/indexing/api.py:136:16: S324 Probable use of insecure hash functions in `hashlib`: `sha1`

Check failure on line 136 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.10) / Python 3.10

Ruff (S324)

langchain_core/indexing/api.py:136:16: S324 Probable use of insecure hash functions in `hashlib`: `sha1`

Check failure on line 136 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.12) / Python 3.12

Ruff (S324)

langchain_core/indexing/api.py:136:16: S324 Probable use of insecure hash functions in `hashlib`: `sha1`

Check failure on line 136 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.13) / Python 3.13

Ruff (S324)

langchain_core/indexing/api.py:136:16: S324 Probable use of insecure hash functions in `hashlib`: `sha1`
if algorithm == "blake2b":
return hashlib.blake2b(text.encode("utf-8")).hexdigest()
if algorithm == "sha256":
Expand All @@ -177,13 +158,11 @@
Args:
document: Document to hash.
key_encoder: Hashing algorithm to use for hashing the document.
If not provided, a default encoder using SHA-1 will be used.
SHA-1 is not collision-resistant, and a motivated attacker
could craft two different texts that hash to the
same cache key.
If not provided, a default encoder using SHA-256 will be used.
SHA-256 is collision-resistant and recommended for new applications.

New applications should use one of the alternative encoders
or provide a custom and strong key encoder function to avoid this risk.
Applications may configure an alternative strong encoder
or provide a custom key encoder function.

When changing the key encoder, you must change the
index as well to avoid duplicated documents in the cache.
Expand Down Expand Up @@ -388,7 +367,7 @@
# Behavior is deprecated, but we keep it for backwards compatibility.
# # Warn only once per process.
if key_encoder == "sha1":
_warn_about_sha1()

Check failure on line 370 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.11) / Python 3.11

Ruff (F821)

langchain_core/indexing/api.py:370:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 370 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.10) / Python 3.10

Ruff (F821)

langchain_core/indexing/api.py:370:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 370 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.12) / Python 3.12

Ruff (F821)

langchain_core/indexing/api.py:370:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 370 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.13) / Python 3.13

Ruff (F821)

langchain_core/indexing/api.py:370:9: F821 Undefined name `_warn_about_sha1`

if cleanup not in {"incremental", "full", "scoped_full", None}:
msg = (
Expand Down Expand Up @@ -729,7 +708,7 @@
# Behavior is deprecated, but we keep it for backwards compatibility.
# # Warn only once per process.
if key_encoder == "sha1":
_warn_about_sha1()

Check failure on line 711 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.11) / Python 3.11

Ruff (F821)

langchain_core/indexing/api.py:711:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 711 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.10) / Python 3.10

Ruff (F821)

langchain_core/indexing/api.py:711:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 711 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.12) / Python 3.12

Ruff (F821)

langchain_core/indexing/api.py:711:9: F821 Undefined name `_warn_about_sha1`

Check failure on line 711 in libs/core/langchain_core/indexing/api.py

View workflow job for this annotation

GitHub Actions / lint (libs/core, 3.13) / Python 3.13

Ruff (F821)

langchain_core/indexing/api.py:711:9: F821 Undefined name `_warn_about_sha1`

if cleanup not in {"incremental", "full", "scoped_full", None}:
msg = (
Expand Down
Loading