Skip to content

file_utils

[ allennlp.common.file_utils ]


Utilities for working with the local dataset cache.

CACHE_ROOT#

CACHE_ROOT = Path(os.getenv("ALLENNLP_CACHE_ROOT", Path.home() / ".allennlp"))

CACHE_DIRECTORY#

CACHE_DIRECTORY = str(CACHE_ROOT / "cache")

DEPRECATED_CACHE_DIRECTORY#

DEPRECATED_CACHE_DIRECTORY = str(CACHE_ROOT / "datasets")

DATASET_CACHE#

DATASET_CACHE = CACHE_DIRECTORY

url_to_filename#

def url_to_filename(url: str, etag: str = None) -> str

Convert url into a hashed filename in a repeatable way. If etag is specified, append its hash to the url's, delimited by a period.

filename_to_url#

def filename_to_url(
    filename: str,
    cache_dir: str = None
) -> Tuple[str, str]

Return the url and etag (which may be None) stored for filename. Raise FileNotFoundError if filename or its stored metadata do not exist.

cached_path#

def cached_path(
    url_or_filename: Union[str, Path],
    cache_dir: str = None
) -> str

Given something that might be a URL (or might be a local path), determine which. If it's a URL, download the file and cache it, and return the path to the cached file. If it's already a local path, make sure the file exists and then return the path.

is_url_or_existing_file#

def is_url_or_existing_file(
    url_or_filename: Union[str, Path, None]
) -> bool

Given something that might be a URL (or might be a local path), check whether it is a URL or an existing file path.

CacheFile Objects#

class CacheFile():
 | def __init__(
 |     self,
 |     cache_filename: Union[Path, str],
 |     mode="w+b"
 | ) -> None

This is a context manager that makes robust caching easier.

On __enter__, an IO handle to a temporary file is returned, which can be treated as if it's the actual cache file.

On __exit__, the temporary file is renamed to the cache file. If anything goes wrong while writing to the temporary file, it will be removed.

get_from_cache#

def get_from_cache(url: str, cache_dir: str = None) -> str

Given a URL, look for the corresponding dataset in the local cache. If it's not there, download it. Then return the path to the cached file.

read_set_from_file#

def read_set_from_file(filename: str) -> Set[str]

Extract a de-duped collection (set) of text from a file. Expected file format is one item per line.

get_file_extension#

def get_file_extension(path: str, dot=True, lower: bool = True)

open_compressed#

def open_compressed(
    filename: Union[str, Path],
    mode: str = "rt",
    encoding: Optional[str] = "UTF-8",
    **kwargs
)