# Source code for cldfbench.cldf

import sys
import shutil
import pathlib
import warnings
import collections

import attr
from csvw.metadata import Link
import pycldf
from pycldf.dataset import get_modules, MD_SUFFIX, Dataset
from pycldf.util import pkg_path
from cldfcatalog import Repository

from cldfbench.catalogs import Catalog
from cldfbench.util import iter_requirements

__all__ = ['CLDFWriter', 'CLDFSpec']
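# Writing zipped tables requires pycldf >= 1.29; we check the installed version once: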
WITH_ZIPPED = tuple(map(int, pycldf.__version__.split('.')[:2])) >= (1, 29)


class CLDFWriter(object):
    """
    An object mediating the writing of data as a proper CLDF dataset.

    Implements a context manager which upon exiting will write all objects acquired within the
    context to disk.

    :ivar cldf_spec: :class:`CLDFSpec` instance, configuring the CLDF dataset written by the \
    writer.
    :ivar objects: `dict` of `list` s to collect the data items. Will be passed as kwargs to \
    `pycldf.Dataset.write`.

    Usage:

    .. code-block:: python

        >>> with CLDFWriter(cldf_spec) as writer:
        ...     writer.objects['ValueTable'].append(...)
    """
    def __init__(self, cldf_spec=None, args=None, dataset=None, clean=True):
        """
        :param cldf_spec: `CLDFSpec` instance
        :param args: `argparse.Namespace`, passed if the writer is instantiated from a cli command.
        :param dataset: `cldfbench.Dataset`, passed if instantiated from a dataset method.
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing.
        """
        self.cldf_spec = cldf_spec or CLDFSpec(dir=getattr(dataset, 'cldf_dir', '.'))
        self.objects = collections.defaultdict(list)
        self.args = args
        self.dataset = dataset
        self._cldf = None
        self._clean = clean

    @property
    def cldf(self) -> pycldf.Dataset:
        """
        The `pycldf.Dataset` used to write the data.

        :raises AttributeError: If accessed outside of the context managed by this writer.
        """
        if self._cldf is None:
            raise AttributeError('Writer.cldf is only set when Writer is used in with statement!')
        return self._cldf

    def __getitem__(self, type_):
        """
        Mirrors `pycldf.Dataset.__getitem__`
        """
        return self.cldf[type_]

    def __enter__(self):
        """
        Upon entering the writer context

        - the target directory is cleaned up,
        - the CLDF metadata is initialized and provided as attribute `cldf`.

        Within the context,

        - the CLDF schema can be manipulated via `CLDFWriter.cldf`, see \
          `<https://pycldf.readthedocs.io/en/latest/dataset.html#editing-metadata-and-schema>`
        - sources can be added, see \
          `<https://pycldf.readthedocs.io/en/latest/dataset.html#adding-data>`
        - data items can be appended to `self.objects`.
        """
        if self._clean:
            self.cldf_spec.make_clean()
        self.cldf_spec.copy_metadata()
        self._cldf = self.cldf_spec.get_dataset()
        # Hook up custom csv file names for the requested components:
        for comp, fname in self.cldf_spec.data_fnames.items():
            try:
                t = self._cldf[comp]
                t.url = Link(fname)
            except KeyError:
                self._cldf.add_component(comp, url=fname)
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """
        When exiting the writer context, write data (and metadata) to disk.
        """
        if WITH_ZIPPED:
            self.write(zipped=self.cldf_spec.zipped, **self.objects)
        else:  # pragma: no cover
            self.write(**self.objects)

    def write(self, **kw):
        self.cldf.properties.setdefault('rdf:type', 'http://www.w3.org/ns/dcat#Distribution')
        srcs = []
        # Let's see whether self.dataset is a repository:
        if self.dataset:
            self.cldf.properties.setdefault('rdf:ID', self.dataset.id)
            for k, v in self.dataset.metadata.common_props().items():
                self.cldf.properties.setdefault(k, v)
            if self.dataset.repo:
                if self.dataset.repo.url:
                    self.cldf.properties.setdefault('dcat:accessURL', self.dataset.repo.url)
                srcs.append(self.dataset.repo.json_ld())
        if self.args:
            # We inspect the cli arguments to see whether some `Catalog`s were used.
            for cat in vars(self.args).values():
                if isinstance(cat, Catalog):
                    srcs.append(cat.json_ld())
        # And check whether any repositories have been "mounted" via git submodules in raw/:
        if self.dataset and self.dataset.raw_dir.exists():
            for p in self.dataset.raw_dir.iterdir():
                if p.is_dir():
                    try:
                        repo = Repository(p)
                    except ValueError:
                        continue
                    srcs.append(repo.json_ld())
        if srcs:
            self.cldf.add_provenance(wasDerivedFrom=srcs)
        reqs = [
            collections.OrderedDict([
                ('dc:title', "python"), ('dc:description', sys.version.split()[0])])]
        try:
            # Snapshot the installed python packages for provenance:
            self.cldf_spec.dir.joinpath('requirements.txt').write_text(
                '\n'.join(iter_requirements()), encoding='utf8')
            reqs.append(
                collections.OrderedDict([
                    ('dc:title', "python-packages"), ('dc:relation', 'requirements.txt')]))
        except ValueError:  # pragma: no cover
            pass
        self.cldf.add_provenance(wasGeneratedBy=reqs)
        self.cldf.write(**kw)
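
# A minimal usage sketch (illustrative; the directory name 'mydataset' and the
# appended row are hypothetical). Within the context, the schema can be adapted
# via `writer.cldf`; upon exit, tables, metadata, a requirements.txt snapshot
# and provenance records are written to disk:
#
#     spec = CLDFSpec(dir='mydataset', module='StructureDataset')
#     with CLDFWriter(cldf_spec=spec) as writer:
#         writer.objects['ValueTable'].append(
#             dict(ID='1', Language_ID='lang1', Parameter_ID='param1', Value='yes'))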

@attr.s
class CLDFSpec(object):
    """
    Basic specification to initialize a CLDF Dataset.

    :ivar dir: A directory where the CLDF data is located.
    :ivar module: `pycldf.Dataset` subclass or name of a CLDF module.
    :ivar default_metadata_path: Path to the source file for the default metadata for a dataset.
    :ivar metadata_fname: Filename to be used for the actual copy of the metadata.
    :ivar data_fnames: A `dict` mapping component names to custom csv file names (which may be \
    important if multiple different CLDF datasets are created in the same directory).
    :ivar writer_cls: `CLDFWriter` subclass to use for writing the data.
    :ivar zipped: An `iterable` listing component names or csv file names for which the \
    corresponding tables should be zipped.
    """
    dir = attr.ib(converter=lambda s: pathlib.Path(s) if s else s)
    module = attr.ib(
        default='Generic',
        converter=lambda cls: getattr(cls, '__name__', cls),
        validator=attr.validators.in_([m.id for m in get_modules()]))
    default_metadata_path = attr.ib(default=None)
    metadata_fname = attr.ib(default=None)
    data_fnames = attr.ib(default=attr.Factory(dict))
    writer_cls = attr.ib(default=CLDFWriter)
    zipped = attr.ib(default=attr.Factory(set))

    def __attrs_post_init__(self):
        if self.zipped and not WITH_ZIPPED:  # pragma: no cover
            warnings.warn('Writing zipped tables requires pycldf >= 1.29', category=UserWarning)
        if self.default_metadata_path:
            self.default_metadata_path = pathlib.Path(self.default_metadata_path)
            try:
                Dataset.from_metadata(self.default_metadata_path)
            except Exception:
                raise ValueError(
                    'invalid default metadata: {0}'.format(self.default_metadata_path))
        else:
            self.default_metadata_path = pkg_path(
                'modules', '{0}{1}'.format(self.module, MD_SUFFIX))

        if not self.metadata_fname:
            self.metadata_fname = self.default_metadata_path.name

    @property
    def metadata_path(self):
        return (self.dir / self.metadata_fname) if self.dir else pathlib.Path(self.metadata_fname)

    def make_clean(self):
        self.dir.mkdir(exist_ok=True)
        for p in self.dir.iterdir():
            if p.is_file() and p.name not in ['.gitattributes', 'README.md']:
                p.unlink()
        gitattributes = self.dir / '.gitattributes'
        if not gitattributes.exists():
            with gitattributes.open('wt') as fp:
                fp.write('*.csv text eol=crlf')

    def copy_metadata(self):
        shutil.copy(str(self.default_metadata_path), str(self.metadata_path))

    def get_dataset(self):
        # Initialize a CLDF Dataset:
        return self.cls.from_metadata(self.metadata_path)

    def get_writer(self, args=None, dataset=None, clean=True):
        return self.writer_cls(cldf_spec=self, args=args, dataset=dataset, clean=clean)

    @property
    def cls(self):
        for m in get_modules():
            if m.id == self.module:
                return m.cls
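
# A sketch of how `CLDFSpec` ties the pieces together (the directory 'cldf', the
# custom file name and the appended form row are hypothetical). `data_fnames`
# renames a component's csv file and `zipped` selects tables to be zipped
# (pycldf >= 1.29, see WITH_ZIPPED above):
#
#     spec = CLDFSpec(
#         dir='cldf',
#         module='Wordlist',
#         data_fnames={'FormTable': 'word-forms.csv'},
#         zipped={'FormTable'})
#     with spec.get_writer() as writer:
#         writer.objects['FormTable'].append(
#             dict(ID='1', Language_ID='lang1', Parameter_ID='hand', Form='mano'))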