Skip to content

file_utils

allennlp.common.file_utils

[SOURCE]


Utilities for working with the local dataset cache.

CACHE_ROOT#

CACHE_ROOT = Path(os.getenv("ALLENNLP_CACHE_ROOT", Path.home() / ".allennlp"))

CACHE_DIRECTORY#

CACHE_DIRECTORY = str(CACHE_ROOT / "cache")

DEPRECATED_CACHE_DIRECTORY#

DEPRECATED_CACHE_DIRECTORY = str(CACHE_ROOT / "datasets")

DATASET_CACHE#

DATASET_CACHE = CACHE_DIRECTORY

filename_to_url#

def filename_to_url(
    filename: str,
    cache_dir: Union[str, Path] = None
) -> Tuple[str, str]

Return the url and etag (which may be None) stored for filename. Raise FileNotFoundError if filename or its stored metadata do not exist.

cached_path#

def cached_path(
    url_or_filename: Union[str, PathLike],
    cache_dir: Union[str, Path] = None,
    extract_archive: bool = False,
    force_extract: bool = False
) -> str

Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path.

Parameters

  • url_or_filename : Union[str, Path]
    A URL or local file to parse and possibly download.

  • cache_dir : Union[str, Path], optional (default = None)
    The directory to cache downloads.

  • extract_archive : bool, optional (default = False)
    If True, then zip or tar.gz archives will be automatically extracted, in which case the path to the extracted directory is returned.

  • force_extract : bool, optional (default = False)
    If True and the file is an archive file, it will be extracted regardless of whether or not the extracted directory already exists.

is_url_or_existing_file#

def is_url_or_existing_file(
    url_or_filename: Union[str, Path, None]
) -> bool

Given something that might be a URL (or might be a local path), determine whether it's a URL or an existing file path.

CacheFile#

class CacheFile:
 | def __init__(
 |     self,
 |     cache_filename: Union[Path, str],
 |     mode: str = "w+b",
 |     suffix: str = ".tmp"
 | ) -> None

This is a context manager that makes robust caching easier.

On __enter__, an IO handle to a temporary file is returned, which can be treated as if it's the actual cache file.

On __exit__, the temporary file is renamed to the cache file. If anything goes wrong while writing to the temporary file, it will be removed.

get_from_cache#

def get_from_cache(
    url: str,
    cache_dir: Union[str, Path] = None
) -> str

Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file.

read_set_from_file#

def read_set_from_file(filename: str) -> Set[str]

Extract a de-duped collection (set) of text from a file. Expected file format is one item per line.

get_file_extension#

def get_file_extension(path: str, dot=True, lower: bool = True)

open_compressed#

def open_compressed(
    filename: Union[str, Path],
    mode: str = "rt",
    encoding: Optional[str] = "UTF-8",
    **kwargs
)

text_lines_from_file#

def text_lines_from_file(
    filename: Union[str, Path],
    strip_lines: bool = True
) -> Iterator[str]

json_lines_from_file#

def json_lines_from_file(
    filename: Union[str, Path]
) -> Iterable[Union[list, dict]]

remove_cache_entries#

def remove_cache_entries(
    patterns: List[str],
    cache_dir: Union[str, Path] = None
) -> int

Remove cache entries matching the given patterns.

Returns the total reclaimed space in bytes.

inspect_cache#

def inspect_cache(
    patterns: List[str] = None,
    cache_dir: Union[str, Path] = None
)

Print out useful information about the cache directory.