blob: 21004acc118d4c14ec178e09ff461d93a743197c [file] [log] [blame]
"""
.. image::
../_static/files-generic.png
This backend stores responses in files on the local filesystem, with one file per response.
Use Cases
^^^^^^^^^
This backend is useful if you would like to use your cached response data outside of requests-cache,
for example:
* Manually viewing cached responses without the need for extra tools (e.g., with a simple text editor)
* Using cached responses as sample data for automated tests
* Reading cached responses directly from another application or library, without depending on
requests-cache
File Formats
^^^^^^^^^^^^
By default, responses are saved as pickle files. If you want to save responses in a human-readable
format, you can use one of the other available :ref:`serializers`. For example, to save responses as
JSON files:
>>> session = CachedSession('~/http_cache', backend='filesystem', serializer='json')
>>> session.get('https://httpbin.org/get')
>>> print(list(session.cache.paths()))
['/home/user/http_cache/4dc151d95200ec.json']
Or as YAML (requires ``pyyaml``):
>>> session = CachedSession('~/http_cache', backend='filesystem', serializer='yaml')
>>> session.get('https://httpbin.org/get')
>>> print(list(session.cache.paths()))
['/home/user/http_cache/4dc151d95200ec.yaml']
Cache Files
^^^^^^^^^^^
* See :ref:`files` for general info on specifying cache paths
* The path for a given response will be in the format ``<cache_name>/<cache_key>.<extension>``, where the extension defaults to one based on the serializer
* Redirects are stored in a separate SQLite database, located at ``<cache_name>/redirects.sqlite``
* Use :py:meth:`.FileCache.paths` to get a list of all cached response paths
Performance and Limitations
^^^^^^^^^^^^^^^^^^^^^^^^^^^
* Write performance will vary based on the serializer used, in the range of roughly 1-3ms per write.
* This backend stores response files in a single directory, and does not currently implement
fan-out. This means that on most filesystems, storing a very large number of responses will result
in reduced performance.
* This backend currently uses a simple threading lock rather than a file lock system, so it is not
an ideal choice for highly parallel applications.
API Reference
^^^^^^^^^^^^^
.. automodsumm:: requests_cache.backends.filesystem
:classes-only:
:nosignatures:
"""
from contextlib import contextmanager
from os import makedirs
from pathlib import Path
from pickle import PickleError
from shutil import rmtree
from threading import RLock
from typing import Iterator
from ..serializers import SERIALIZERS
from . import BaseCache, BaseStorage
from .sqlite import AnyPath, SQLiteDict, get_cache_path
class FileCache(BaseCache):
    """Cache backend that keeps each response in its own file on the local filesystem.

    Responses are held in a :py:class:`FileDict`; redirects are held in a
    :py:class:`SQLiteDict` stored as ``redirects.sqlite`` inside the cache directory.

    Args:
        cache_name: Base directory for cache files
        use_cache_dir: Store cache files in a user cache directory (e.g., ``~/.cache/``)
        use_temp: Store cache files in a temp directory (e.g., ``/tmp/http_cache/``).
            Note: if ``cache_name`` is an absolute path, this option will be ignored.
        extension: Extension for cache files. If not specified, the serializer default extension
            will be used.
    """

    def __init__(self, cache_name: AnyPath = 'http_cache', use_temp: bool = False, **kwargs):
        super().__init__(**kwargs)
        # The response storage determines (and creates) the cache directory, so it must be built
        # before the redirects database path can be derived from it.
        self.responses: FileDict = FileDict(cache_name, use_temp=use_temp, **kwargs)
        redirects_db = self.cache_dir / 'redirects.sqlite'
        self.redirects: SQLiteDict = SQLiteDict(redirects_db, 'redirects', **kwargs)

    @property
    def cache_dir(self) -> Path:
        """Directory that holds the response files and the redirects database"""
        responses_dir = self.responses.cache_dir
        return Path(responses_dir)

    def paths(self) -> Iterator[Path]:
        """Get file paths to all cached responses"""
        response_paths = self.responses.paths()
        return response_paths

    def clear(self):
        """Remove all cached responses and redirects"""
        # Clearing the response storage deletes the whole cache directory, and redirects.sqlite
        # along with it, so the redirects database has to be re-initialized afterwards.
        self.responses.clear()
        self.redirects.init_db()

    def remove_expired_responses(self, *args, **kwargs):
        """Run the base class's expiration cleanup while holding the response storage lock"""
        with self.responses._lock:
            result = super().remove_expired_responses(*args, **kwargs)
        return result
class FileDict(BaseStorage):
    """A dictionary-like interface to files on the local filesystem.

    Each key maps to one file named ``<key><extension>`` inside ``cache_dir``. Values are passed
    through ``self.serializer`` (expected to be provided by the base class) when written and read.

    Args:
        cache_name: Passed to ``get_cache_path`` to determine the cache directory
        use_temp: Passed to ``get_cache_path``
        use_cache_dir: Passed to ``get_cache_path``
        extension: Extension for cache files. If not specified, one is derived from the serializer.
        kwargs: Forwarded to the base class
    """

    def __init__(
        self,
        cache_name: AnyPath,
        use_temp: bool = False,
        use_cache_dir: bool = False,
        extension: str = None,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.cache_dir = get_cache_path(cache_name, use_cache_dir=use_cache_dir, use_temp=use_temp)
        self.extension = _get_extension(extension, self.serializer)
        # Serializers that don't declare ``is_binary`` are treated as text serializers
        self.is_binary = getattr(self.serializer, 'is_binary', False)
        # Re-entrant, so a thread already holding the lock can still use the dict methods
        self._lock = RLock()
        makedirs(self.cache_dir, exist_ok=True)

    @contextmanager
    def _try_io(self, ignore_errors: bool = False):
        """Attempt an I/O operation while holding the lock, and either ignore errors or re-raise
        them as KeyErrors

        Args:
            ignore_errors: If true, swallow I/O and unpickling errors instead of raising

        Raises:
            KeyError: On ``EOFError``, ``OSError``, or ``PickleError``, unless ``ignore_errors``
                is set
        """
        try:
            with self._lock:
                yield
        # IOError is an alias of OSError, so it does not need to be listed separately
        except (EOFError, OSError, PickleError) as e:
            if not ignore_errors:
                # Chain explicitly so the underlying I/O error is reported as the direct cause
                raise KeyError(e) from e

    def _path(self, key) -> Path:
        """Get the file path used to store the given key"""
        return self.cache_dir / f'{key}{self.extension}'

    def __getitem__(self, key):
        """Read and deserialize the file for ``key``; raises ``KeyError`` if it can't be read"""
        # NOTE(review): text mode relies on the platform's default encoding; consider an explicit
        # encoding if cache files need to be portable between systems.
        mode = 'rb' if self.is_binary else 'r'
        with self._try_io():
            with self._path(key).open(mode) as f:
                return self.serializer.loads(f.read())

    def __delitem__(self, key):
        """Delete the file for ``key``; raises ``KeyError`` if it can't be removed"""
        with self._try_io():
            self._path(key).unlink()

    def __setitem__(self, key, value):
        """Serialize ``value`` and write it to the file for ``key``.

        I/O failures are raised as ``KeyError``.
        """
        mode = 'wb' if self.is_binary else 'w'
        with self._try_io():
            with self._path(key).open(mode=mode) as f:
                f.write(self.serializer.dumps(value))

    def __iter__(self):
        yield from self.keys()

    def __len__(self):
        return sum(1 for _ in self.paths())

    def clear(self):
        """Remove the cache directory and everything in it, then recreate it empty.

        This is best-effort: errors are ignored.
        """
        with self._try_io(ignore_errors=True):
            rmtree(self.cache_dir, ignore_errors=True)
            # Match the ``makedirs(..., exist_ok=True)`` call in ``__init__``: recreate the
            # directory even if a parent directory is missing, and don't treat a directory that
            # survived a partial ``rmtree`` as an error.
            self.cache_dir.mkdir(parents=True, exist_ok=True)

    def keys(self):
        """Get all keys, derived from the file names (without extension) of cached responses"""
        return [path.stem for path in self.paths()]

    def paths(self) -> Iterator[Path]:
        """Get file paths to all cached responses"""
        with self._lock:
            # ``Path.glob()`` is lazy: returning it directly would release the lock before any
            # directory scanning takes place. Collect the results while the lock is still held.
            return iter(list(self.cache_dir.glob(f'*{self.extension}')))
def _get_extension(extension: str = None, serializer=None) -> str:
    """Use either the provided file extension, or get the serializer's default extension

    Args:
        extension: File extension, with or without a leading dot (``'json'`` or ``'.json'``)
        serializer: Serializer object to look up in ``SERIALIZERS`` if no extension is given

    Returns:
        The extension including its leading dot, or an empty string if no extension was given and
        the serializer is not one of the objects in ``SERIALIZERS``
    """
    # Strip any leading dot(s) so that '.json' doesn't turn into '..json'
    suffix = (extension or '').lstrip('.')
    if suffix:
        return f'.{suffix}'

    # Fall back to the name the serializer is registered under; pickle files use '.pkl'
    for name, obj in SERIALIZERS.items():
        if serializer is obj:
            return '.' + name.replace('pickle', 'pkl')
    return ''