file_utils
allennlp.common.file_utils
Utilities for working with the local dataset cache.
CACHE_ROOT¶
CACHE_ROOT = Path(os.getenv("ALLENNLP_CACHE_ROOT", Path.home() / ".allennlp"))
CACHE_DIRECTORY¶
CACHE_DIRECTORY = str(CACHE_ROOT / "cache")
DEPRECATED_CACHE_DIRECTORY¶
DEPRECATED_CACHE_DIRECTORY = str(CACHE_ROOT / "datasets")
DATASET_CACHE¶
DATASET_CACHE = CACHE_DIRECTORY
filename_to_url¶
def filename_to_url(
filename: str,
cache_dir: Union[str, Path] = None
) -> Tuple[str, str]
Return the url and etag (which may be None) stored for filename.
Raise FileNotFoundError if filename or its stored metadata do not exist.
cached_path¶
def cached_path(
url_or_filename: Union[str, PathLike],
cache_dir: Union[str, Path] = None,
extract_archive: bool = False,
force_extract: bool = False
) -> str
Given something that might be a URL or local path, determine which. If it's a remote resource, download the file and cache it, and then return the path to the cached file. If it's already a local path, make sure the file exists and return the path.
For URLs, "http://", "https://", "s3://", "gs://", and "hf://" are all supported. The latter corresponds to the HuggingFace Hub.
For example, to download the PyTorch weights for the model epwalsh/bert-xsmall-dummy
on HuggingFace, you could do:
cached_path("hf://epwalsh/bert-xsmall-dummy/pytorch_model.bin")
For paths or URLs that point to a tarfile or zipfile, you can also add a path
to a specific file to the url_or_filename preceded by a "!", and the archive
will be automatically extracted (provided you set extract_archive to True),
returning the local path to the specific file. For example:
cached_path("model.tar.gz!weights.th", extract_archive=True)
Parameters¶
- url_or_filename : Union[str, PathLike]
  A URL or path to parse and possibly download.
- cache_dir : Union[str, Path], optional (default = None)
  The directory to cache downloads.
- extract_archive : bool, optional (default = False)
  If True, then zip or tar.gz archives will be automatically extracted. In which case the directory is returned.
- force_extract : bool, optional (default = False)
  If True and the file is an archive file, it will be extracted regardless of whether or not the extracted directory already exists.

  Warning: Use this flag with caution! This can lead to race conditions if used from multiple processes on the same file.
TensorCache¶
class TensorCache(MutableMapping[str, Tensor], ABC):
| def __init__(
| self,
| filename: Union[str, PathLike],
| *,
| map_size: int = 1024 * 1024 * 1024 * 1024,
| read_only: bool = False
| ) -> None
This is a key-value store, mapping strings to tensors. The data is kept on disk, making this class useful as a cache for storing tensors.
TensorCache
is also safe to access from multiple processes at the same time, so
you can use it in distributed training situations, or from multiple training
runs at the same time.
read_only¶
class TensorCache(MutableMapping[str, Tensor], ABC):
| ...
| @property
| def read_only(self) -> bool
__iter__¶
class TensorCache(MutableMapping[str, Tensor], ABC):
| ...
| def __iter__(self)
LocalCacheResource¶
class LocalCacheResource:
| def __init__(
| self,
| resource_name: str,
| version: str,
| cache_dir: str = CACHE_DIRECTORY
| ) -> None
This is a context manager that can be used to fetch and cache arbitrary resources locally
using the same mechanisms that cached_path
uses for remote resources.
It can be used, for example, when you want to cache the result of an expensive computation.
Examples¶
with LocalCacheResource("long-computation", "v1") as cache:
if cache.cached():
with cache.reader() as f:
# read from cache
else:
with cache.writer() as f:
# do the computation
# ...
# write to cache
cached¶
class LocalCacheResource:
| ...
| def cached(self) -> bool
writer¶
class LocalCacheResource:
| ...
| @contextmanager
| def writer(self, mode="w")
reader¶
class LocalCacheResource:
| ...
| @contextmanager
| def reader(self, mode="r")
read_set_from_file¶
def read_set_from_file(filename: str) -> Set[str]
Extract a de-duped collection (set) of text from a file. Expected file format is one item per line.
get_file_extension¶
def get_file_extension(path: str, dot=True, lower: bool = True)
open_compressed¶
def open_compressed(
filename: Union[str, PathLike],
mode: str = "rt",
encoding: Optional[str] = "UTF-8",
**kwargs
)
text_lines_from_file¶
def text_lines_from_file(
filename: Union[str, PathLike],
strip_lines: bool = True
) -> Iterator[str]
json_lines_from_file¶
def json_lines_from_file(
filename: Union[str, PathLike]
) -> Iterable[Union[list, dict]]
remove_cache_entries¶
def remove_cache_entries(
patterns: List[str],
cache_dir: Union[str, Path] = None
) -> int
Remove cache entries matching the given patterns.
Returns the total reclaimed space in bytes.
inspect_cache¶
def inspect_cache(
patterns: List[str] = None,
cache_dir: Union[str, Path] = None
)
Print out useful information about the cache directory.
hardlink_or_copy¶
def hardlink_or_copy(source: PathOrStr, dest: PathOrStr)