
file_utils

allennlp.common.file_utils



Utilities for working with the local dataset cache.

CACHE_ROOT#

CACHE_ROOT = Path(os.getenv("ALLENNLP_CACHE_ROOT", Path.home() / ".allennlp"))

CACHE_DIRECTORY#

CACHE_DIRECTORY = str(CACHE_ROOT / "cache")

DEPRECATED_CACHE_DIRECTORY#

DEPRECATED_CACHE_DIRECTORY = str(CACHE_ROOT / "datasets")

DATASET_CACHE#

DATASET_CACHE = CACHE_DIRECTORY

FileLock#

class FileLock(_FileLock):
 | def __init__(
 |     self,
 |     lock_file: Union[str, PathLike],
 |     timeout=-1,
 |     read_only_ok: bool = False
 | ) -> None

This is just a subclass of the FileLock class from the filelock library, except that it adds an additional argument to the __init__ method: read_only_ok.

By default this flag is False, meaning an exception will be thrown when a lock can't be acquired due to a lack of write permissions. But if this flag is set to True, a warning will be emitted instead of an error when the lock already exists but can't be acquired because write access is blocked.
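For example, such a lock might be used like this (the lock file path is illustrative):

from allennlp.common.file_utils import FileLock

# With read_only_ok=True, a lock that can't be acquired because the
# filesystem is read-only produces a warning rather than an exception.
with FileLock("/tmp/my-resource.lock", read_only_ok=True):
    ...  # read or write the protected resource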

acquire#

class FileLock(_FileLock):
 | ...
 | @overrides
 | def acquire(self, timeout=None, poll_interval=0.05)

filename_to_url#

def filename_to_url(
    filename: str,
    cache_dir: Union[str, Path] = None
) -> Tuple[str, str]

Return the URL and ETag (which may be None) stored for filename. Raise FileNotFoundError if filename or its stored metadata do not exist.
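A usage sketch (the cached filename is illustrative):

from allennlp.common.file_utils import filename_to_url

try:
    url, etag = filename_to_url("path/to/cached/file")
    print(url, etag)  # etag may be None
except FileNotFoundError:
    print("not a cached file, or its metadata is missing")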

check_tarfile#

def check_tarfile(tar_file: tarfile.TarFile)

Tar files can contain files outside of the extraction directory, or symlinks that point outside the extraction directory. We also don't want any block devices, FIFOs, or other weird file types extracted. This checks for those issues and throws an exception if there is a problem.
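For instance, an archive might be validated before extraction (the archive name is illustrative):

import tarfile

from allennlp.common.file_utils import check_tarfile

with tarfile.open("archive.tar.gz", "r:gz") as tar:
    check_tarfile(tar)  # raises if the archive contains anything unsafe
    tar.extractall("output_dir")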

cached_path#

def cached_path(
    url_or_filename: Union[str, PathLike],
    cache_dir: Union[str, Path] = None,
    extract_archive: bool = False,
    force_extract: bool = False
) -> str

Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path.

Parameters

  • url_or_filename : Union[str, PathLike]
    A URL or local file to parse and possibly download.

  • cache_dir : Union[str, Path], optional (default = None)
    The directory to cache downloads.

  • extract_archive : bool, optional (default = False)
    If True, then zip or tar.gz archives will be automatically extracted, in which case the path to the extracted directory is returned.

  • force_extract : bool, optional (default = False)
    If True and the file is an archive file, it will be extracted regardless of whether or not the extracted directory already exists.
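A usage sketch (the URL and local path are illustrative):

from allennlp.common.file_utils import cached_path

# Download a remote file (or reuse the cached copy) and get its local path.
local_path = cached_path("https://example.com/dataset.tar.gz", extract_archive=True)

# A local path passes through unchanged, apart from an existence check.
same_path = cached_path("/path/to/local/file.txt")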

is_url_or_existing_file#

def is_url_or_existing_file(
    url_or_filename: Union[str, Path, None]
) -> bool

Given something that might be a URL (or might be a local path), check whether it is a URL or an existing file path.
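For example (the inputs are illustrative):

from allennlp.common.file_utils import is_url_or_existing_file

is_url_or_existing_file("https://example.com/model.tar.gz")  # True: it's a URL
is_url_or_existing_file("/nonexistent/path")                 # False: not a URL, doesn't exist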

CacheFile#

class CacheFile:
 | def __init__(
 |     self,
 |     cache_filename: Union[Path, str],
 |     mode: str = "w+b",
 |     suffix: str = ".tmp"
 | ) -> None

This is a context manager that makes robust caching easier.

On __enter__, an IO handle to a temporary file is returned, which can be treated as if it's the actual cache file.

On __exit__, the temporary file is renamed to the cache file. If anything goes wrong while writing to the temporary file, it will be removed.
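A usage sketch (the cache path is illustrative):

from allennlp.common.file_utils import CacheFile

# The temporary file is renamed to the final cache file only if this
# block completes without an exception; otherwise it is cleaned up.
with CacheFile("path/to/cache/file") as f:
    f.write(b"data")  # default mode is "w+b", so write bytes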

get_from_cache#

def get_from_cache(
    url: str,
    cache_dir: Union[str, Path] = None
) -> str

Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file.
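A usage sketch (the URL is illustrative):

from allennlp.common.file_utils import get_from_cache

# Downloads the file on a cache miss; otherwise returns the cached copy.
path = get_from_cache("https://example.com/vocab.txt")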

read_set_from_file#

def read_set_from_file(filename: str) -> Set[str]

Extract a de-duped collection (set) of text from a file. Expected file format is one item per line.
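For example, given a file with one item per line (the filename is illustrative):

from allennlp.common.file_utils import read_set_from_file

stopwords = read_set_from_file("stopwords.txt")  # duplicate lines collapse into the set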

get_file_extension#

def get_file_extension(path: str, dot=True, lower: bool = True)
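No description is given, but from the signature this presumably returns the extension of path, keeping the leading dot unless dot=False and lowercasing unless lower=False. An illustrative call:

from allennlp.common.file_utils import get_file_extension

get_file_extension("data/Corpus.TXT")             # presumably ".txt"
get_file_extension("data/Corpus.TXT", dot=False)  # presumably "txt"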

open_compressed#

def open_compressed(
    filename: Union[str, Path],
    mode: str = "rt",
    encoding: Optional[str] = "UTF-8",
    **kwargs
)
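No description is given, but the name and defaults suggest this opens filename transparently, using the appropriate decompressor (e.g. for .gz or .bz2 files) when the extension calls for it and a plain open otherwise. A sketch (the filename is illustrative):

from allennlp.common.file_utils import open_compressed

with open_compressed("corpus.txt.gz") as f:
    first_line = f.readline()  # default mode "rt" yields text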

text_lines_from_file#

def text_lines_from_file(
    filename: Union[str, Path],
    strip_lines: bool = True
) -> Iterator[str]

json_lines_from_file#

def json_lines_from_file(
    filename: Union[str, Path]
) -> Iterable[Union[list, dict]]
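Both helpers appear to be line-by-line readers; the filenames below are illustrative. text_lines_from_file yields the (by default stripped) text lines of a file, while json_lines_from_file parses each line as JSON:

from allennlp.common.file_utils import json_lines_from_file, text_lines_from_file

for line in text_lines_from_file("notes.txt"):
    print(line)

for record in json_lines_from_file("data.jsonl"):
    print(record["id"])  # assumes each line is a JSON object with an "id" key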

remove_cache_entries#

def remove_cache_entries(
    patterns: List[str],
    cache_dir: Union[str, Path] = None
) -> int

Remove cache entries matching the given patterns.

Returns the total reclaimed space in bytes.
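A usage sketch (the pattern is illustrative, and presumably glob-style):

from allennlp.common.file_utils import remove_cache_entries

reclaimed = remove_cache_entries(["*glove*"])
print(f"freed {reclaimed} bytes")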

inspect_cache#

def inspect_cache(
    patterns: List[str] = None,
    cache_dir: Union[str, Path] = None
)

Print out useful information about the cache directory.
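For example:

from allennlp.common.file_utils import inspect_cache

# Summarize the whole default cache directory.
inspect_cache()

# Or restrict the report to matching entries (the pattern is illustrative).
inspect_cache(patterns=["*glove*"])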