langchain[minor]: enhance LocalFileStore to offer update_atime parameter that updates access times on read (#20951)

**Description:**
The `LocalFileStore` class can be used to create an on-disk
`CacheBackedEmbeddings` cache. The number of files in these embeddings
caches can grow to be quite large over time (hundreds of thousands) as
embeddings are computed for new versions of content, but the embeddings
for old/deprecated content are not removed.

A *least-recently-used* (LRU) cache policy could be applied to the
`LocalFileStore` directory to delete cache entries that have not been
referenced for some time:

```bash
# delete files that have not been accessed in the last 90 days
find embeddings_cache_dir/ -atime +90 -print0 | xargs -0 rm
```

However, most filesystems in enterprise environments disable access time
modification on read to improve performance. As a result, the access
times of these cache entry files are not updated when their values are
read.

To resolve this, this pull request updates the `LocalFileStore`
constructor to offer an `update_atime` parameter that causes access
times to be updated when a cache entry is read.

For example,

```python
file_store = LocalFileStore(temp_dir, update_atime=True)
```

The default is `False`, which retains the original behavior.

**Testing:**
I updated the LocalFileStore unit tests to test the access time update.
This commit is contained in:
Chris Papademetrious 2024-05-06 11:52:29 -04:00 committed by GitHub
parent 5b6d1a907d
commit ee6c922c91
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 35 additions and 0 deletions

View File

@ -1,5 +1,6 @@
import os
import re
import time
from pathlib import Path
from typing import Iterator, List, Optional, Sequence, Tuple, Union
@ -42,6 +43,7 @@ class LocalFileStore(ByteStore):
*,
chmod_file: Optional[int] = None,
chmod_dir: Optional[int] = None,
update_atime: bool = False,
) -> None:
"""Implement the BaseStore interface for the local file system.
@ -52,10 +54,15 @@ class LocalFileStore(ByteStore):
for newly created files, overriding the current `umask` if needed.
chmod_dir: (optional, defaults to `None`) If specified, sets permissions
for newly created dirs, overriding the current `umask` if needed.
update_atime: (optional, defaults to `False`) If `True`, updates the
filesystem access time (but not the modified time) when a file is read.
This allows MRU/LRU cache policies to be implemented for filesystems
where access time updates are disabled.
"""
self.root_path = Path(root_path).absolute()
self.chmod_file = chmod_file
self.chmod_dir = chmod_dir
self.update_atime = update_atime
def _get_full_path(self, key: str) -> Path:
"""Get the full path for a given key relative to the root path.
@ -112,6 +119,9 @@ class LocalFileStore(ByteStore):
if full_path.exists():
value = full_path.read_bytes()
values.append(value)
if self.update_atime:
# update access time only; preserve modified time
os.utime(full_path, (time.time(), os.stat(full_path).st_mtime))
else:
values.append(None)
return values

View File

@ -57,6 +57,31 @@ def test_mset_chmod(chmod_dir_s: str, chmod_file_s: str) -> None:
assert (os.stat(file_path).st_mode & 0o777) == chmod_file
def test_mget_update_atime() -> None:
    """Verify that `mget` refreshes the access time when `update_atime=True`.

    The file's access time is backdated before the read so that the check
    does not rely on the clock advancing between the write and the read.
    Comparing two timestamps taken moments apart can spuriously match on
    filesystems with coarse timestamp granularity.
    """
    # Create a temporary directory for testing
    with tempfile.TemporaryDirectory() as temp_dir:
        # Instantiate the LocalFileStore with a directory inside the temporary
        # directory as the root path
        store_dir = os.path.join(temp_dir, "store_dir")
        file_store = LocalFileStore(store_dir, update_atime=True)

        # Set values for keys
        key_value_pairs = [("key1", b"value1"), ("key2", b"value2")]
        file_store.mset(key_value_pairs)

        dir_path = str(file_store.root_path)
        file_path = os.path.join(dir_path, "key1")

        # Push the access time a week into the past (modified time is passed
        # through unchanged) so that any refresh is unambiguous.
        stat_before = os.stat(file_path)
        backdated_atime = int(stat_before.st_atime) - 7 * 24 * 60 * 60
        os.utime(file_path, (backdated_atime, stat_before.st_mtime))
        atime1 = os.stat(file_path).st_atime

        # Get values for keys
        _ = file_store.mget(["key1", "key2"])

        # Make sure the filesystem access time has moved forward
        atime2 = os.stat(file_path).st_atime
        assert atime2 > atime1
def test_mdelete(file_store: LocalFileStore) -> None:
# Set values for keys
key_value_pairs = [("key1", b"value1"), ("key2", b"value2")]