mirror of
https://github.com/hwchase17/langchain.git
synced 2025-07-01 10:54:15 +00:00
langchain[minor]: enhance LocalFileStore
to offer update_atime
parameter that updates access times on read (#20951)
**Description:** The `LocalFileStore` class can be used to create an on-disk `CacheBackedEmbeddings` cache. The number of files in these embeddings caches can grow to be quite large over time (hundreds of thousands) as embeddings are computed for new versions of content, but the embeddings for old/deprecated content are not removed.

A *least-recently-used* (LRU) cache policy could be applied to the `LocalFileStore` directory to delete cache entries that have not been referenced for some time:

```bash
# delete files that have not been accessed in the last 90 days
find embeddings_cache_dir/ -atime +90 -print0 | xargs -0 rm
```

However, most filesystems in enterprise environments disable access time modification on read to improve performance. As a result, the access times of these cache entry files are not updated when their values are read.

To resolve this, this pull request updates the `LocalFileStore` constructor to offer an `update_atime` parameter that causes access times to be updated when a cache entry is read. For example,

```python
file_store = LocalFileStore(temp_dir, update_atime=True)
```

The default is `False`, which retains the original behavior.

**Testing:** I updated the LocalFileStore unit tests to test the access time update.
This commit is contained in:
parent
5b6d1a907d
commit
ee6c922c91
@ -1,5 +1,6 @@
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import Iterator, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
@ -42,6 +43,7 @@ class LocalFileStore(ByteStore):
|
||||
*,
|
||||
chmod_file: Optional[int] = None,
|
||||
chmod_dir: Optional[int] = None,
|
||||
update_atime: bool = False,
|
||||
) -> None:
|
||||
"""Implement the BaseStore interface for the local file system.
|
||||
|
||||
@ -52,10 +54,15 @@ class LocalFileStore(ByteStore):
|
||||
for newly created files, overriding the current `umask` if needed.
|
||||
chmod_dir: (optional, defaults to `None`) If specified, sets permissions
|
||||
for newly created dirs, overriding the current `umask` if needed.
|
||||
update_atime: (optional, defaults to `False`) If `True`, updates the
|
||||
filesystem access time (but not the modified time) when a file is read.
|
||||
This allows MRU/LRU cache policies to be implemented for filesystems
|
||||
where access time updates are disabled.
|
||||
"""
|
||||
self.root_path = Path(root_path).absolute()
|
||||
self.chmod_file = chmod_file
|
||||
self.chmod_dir = chmod_dir
|
||||
self.update_atime = update_atime
|
||||
|
||||
def _get_full_path(self, key: str) -> Path:
|
||||
"""Get the full path for a given key relative to the root path.
|
||||
@ -112,6 +119,9 @@ class LocalFileStore(ByteStore):
|
||||
if full_path.exists():
|
||||
value = full_path.read_bytes()
|
||||
values.append(value)
|
||||
if self.update_atime:
|
||||
# update access time only; preserve modified time
|
||||
os.utime(full_path, (time.time(), os.stat(full_path).st_mtime))
|
||||
else:
|
||||
values.append(None)
|
||||
return values
|
||||
|
@ -57,6 +57,31 @@ def test_mset_chmod(chmod_dir_s: str, chmod_file_s: str) -> None:
|
||||
assert (os.stat(file_path).st_mode & 0o777) == chmod_file
|
||||
|
||||
|
||||
def test_mget_update_atime() -> None:
    """Verify that ``mget`` refreshes a file's access time when ``update_atime=True``.

    The access time is backdated before the read so the check does not depend
    on the filesystem's timestamp resolution or on how quickly the test runs.
    The modified time is expected to be left alone by the read.
    """
    # Create a temporary directory for testing
    with tempfile.TemporaryDirectory() as temp_dir:
        # Instantiate the LocalFileStore with a directory inside the temporary
        # directory as the root path
        store_dir = os.path.join(temp_dir, "store_dir")
        file_store = LocalFileStore(store_dir, update_atime=True)

        # Set values for keys
        key_value_pairs = [("key1", b"value1"), ("key2", b"value2")]
        file_store.mset(key_value_pairs)

        file_path = os.path.join(str(file_store.root_path), "key1")

        # Backdate the access time by an hour (keeping the modified time) so
        # that a refreshed atime is guaranteed to differ from the old one even
        # on filesystems with coarse timestamp granularity.
        created = os.stat(file_path)
        os.utime(file_path, (created.st_atime - 3600, created.st_mtime))

        # Re-read the timestamps as actually stored by the filesystem
        before = os.stat(file_path)
        atime1 = before.st_atime
        mtime1 = before.st_mtime

        # Get values for keys
        _ = file_store.mget(["key1", "key2"])

        # Make sure the filesystem access time has moved forward...
        after = os.stat(file_path)
        atime2 = after.st_atime
        assert atime2 > atime1

        # ...while the modified time has been preserved (tolerance absorbs
        # any float/timestamp rounding on the round trip through os.utime).
        assert abs(after.st_mtime - mtime1) < 1.0
|
||||
|
||||
|
||||
def test_mdelete(file_store: LocalFileStore) -> None:
|
||||
# Set values for keys
|
||||
key_value_pairs = [("key1", b"value1"), ("key2", b"value2")]
|
||||
|
Loading…
Reference in New Issue
Block a user