Source code for cldfbench.dataset

"""
A cldfbench Dataset provides scaffolding to automatically create one or more CLDF Datasets.
"""
import sys
from typing import Union, Optional
import inspect
import pathlib
import logging
import argparse
import functools
import importlib
import subprocess
from collections.abc import Generator

import pycldf
from clldutils.path import sys_path
from clldutils.misc import nfilter
from cldfcatalog import Repository

from cldfbench.cldf import CLDFSpec, CLDFWriter
from cldfbench.datadir import DataDir
from cldfbench.metadata import Metadata
from cldfbench.ci import build_status_badge
from cldfbench.util import get_entrypoints
from cldfbench._compat import utcnow

# Names exported by `from cldfbench.dataset import *`.
__all__ = [
    'iter_datasets',
    'get_dataset',
    'get_datasets',
    'Dataset',
    'ENTRY_POINT',
]

# Default entry point name used to look up registered `Dataset` classes.
ENTRY_POINT = 'cldfbench.dataset'

# Value returned by the default `cmd_download` and `cmd_makecldf` implementations, i.e. by
# commands a `Dataset` subclass did not implement.
NOOP = -1

# Anything `pathlib.Path` can be instantiated from in this module.
PathType = Union[str, pathlib.Path]

# Keys of the mapping returned by `Dataset.cldf_specs_dict`; `None` is the key under which a
# single, unnamed `CLDFSpec` is registered.
SpecDictKeyType = Optional[str]
SpecDictType = dict[SpecDictKeyType, CLDFSpec]


class Dataset:
    """
    A cldfbench dataset ties together

    - `raw` data, to be used as source for the
    - `cldf` data, which is created using config data from
    - `etc`.

    To use the cldfbench infrastructure, one should sub-class `Dataset`.

    cldfbench supports the following workflow:

    - a `download` command populates a `Dataset`'s `raw` directory.
    - a `makecldf` command (re)creates the CLDF dataset in `cldf`.

    The following class attributes are supposed to be overwritten by subclasses:

    :ivar dir: `pathlib.Path` pointing to the root directory of the dataset. If not set, the \
    directory containing the module which defines the class is used.
    :ivar id: A `str` identifier for the dataset. No assumption about uniqueness properties of \
    this identifier is made.
    :ivar metadata_cls: Subclass of :class:`Metadata` (or :class:`Metadata` if not overwritten)
    :ivar datadir_cls: Class used to wrap `dir` (:class:`DataDir` if not overwritten)
    """
    dir = None
    id = None
    metadata_cls = Metadata
    datadir_cls = DataDir

    def __init__(self):
        """
        Resolve the dataset directory and load the dataset's metadata.

        Metadata is read from `metadata.json` in the dataset directory if this file exists;
        otherwise an empty `metadata_cls` instance is created. In both cases the metadata's `id`
        is set to the dataset's `id`.
        """
        if not self.dir:
            # Fall back to the directory of the module in which the (sub)class is defined.
            self.dir = pathlib.Path(inspect.getfile(self.__class__)).parent
        self.dir = self.datadir_cls(self.dir)
        md = self.dir / 'metadata.json'
        self.metadata = self.metadata_cls.from_file(md) if md.exists() else self.metadata_cls()
        self.metadata.id = self.id

    def __str__(self):
        """
        :return: Human readable description made up of class name, dataset ID and directory.
        """
        return f'{self.__class__.__name__} "{self.id}" at {self.dir.resolve()}'

    @functools.cached_property
    def cldf_dir(self) -> DataDir:
        """
        Directory where CLDF data generated from the Dataset will be stored (unless specified
        differently by a :class:`CLDFSpec`).
        """
        return self.dir / 'cldf'

    @functools.cached_property
    def raw_dir(self) -> DataDir:
        """
        Directory where cldfbench expects the raw or source data.
        """
        return self.dir / 'raw'

    @functools.cached_property
    def etc_dir(self) -> DataDir:
        """
        Directory where cldfbench expects additional configuration or metadata.
        """
        return self.dir / 'etc'

    def cldf_specs(self) -> Union[CLDFSpec, SpecDictType]:
        """
        A `Dataset` must declare all CLDF datasets that are derived from it.

        The default implementation declares a single CLDF dataset located in `cldf_dir`.

        :return: A single :class:`CLDFSpec` instance, or a `dict`, mapping names to `CLDFSpec` \
        instances, where the name will be used by `cldf_reader`/`cldf_writer` to look up \
        the spec.
        """
        return CLDFSpec(dir=self.cldf_dir)

    @property
    def cldf_specs_dict(self) -> SpecDictType:
        """
        Turn :meth:`cldf_specs` into a `dict` for simpler lookup.

        A single `CLDFSpec` returned by :meth:`cldf_specs` is registered under the key `None`.

        :return: `dict` mapping lookup keys to `CLDFSpec` instances.
        """
        specs = self.cldf_specs()
        if isinstance(specs, CLDFSpec):
            return {None: specs}
        assert isinstance(specs, dict)
        return specs

    def update_submodules(self):
        """
        Convenience method to be used in a `Dataset`'s `cmd_download` to update raw data curated
        as git submodules.

        Runs `git submodule update --remote` for the repository in the dataset directory.

        :raises subprocess.CalledProcessError: if git exits with a non-zero status.
        :raises FileNotFoundError: if no `git` executable can be found.
        """
        # Pass an argument list rather than a shell command line, so that a dataset directory
        # containing spaces or shell metacharacters is handed to git verbatim.
        subprocess.check_call(
            ['git', '-C', str(self.dir.resolve()), 'submodule', 'update', '--remote'])

    def cldf_writer(
            self,
            args: argparse.Namespace,
            cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None,
            clean: bool = True,
    ) -> CLDFWriter:
        """
        :param args: Namespace passed in when initializing the `CLDFWriter` instance.
        :param cldf_spec: A `CLDFSpec` instance, or the key of the relevant `CLDFSpec` in \
        `Dataset.cldf_specs_dict` (`None` being the key of a single, unnamed spec).
        :param clean: `bool` flag signaling whether to clean the CLDF dir before writing. \
        Note that `False` must be passed for subsequent calls to `cldf_writer` in case the \
        spec re-uses a directory.
        :return: a `cldf_spec.writer_cls` instance, for write-access to CLDF data. \
        This method should be used in a with-statement, and will then return a `CLDFWriter` with \
        an empty working directory.
        :raises KeyError: if `cldf_spec` is a key for which no spec is declared.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            # Anything but a `CLDFSpec` instance is interpreted as lookup key.
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_writer(args=args, dataset=self, clean=clean)

    def cldf_reader(
            self,
            cldf_spec: Union[CLDFSpec, SpecDictKeyType] = None,
    ) -> pycldf.Dataset:
        """
        :param cldf_spec: A `CLDFSpec` instance, or the key of the relevant `CLDFSpec` in \
        `Dataset.cldf_specs_dict` (`None` being the key of a single, unnamed spec).
        :return: a `pycldf.Dataset` instance, for read-access to the CLDF data.
        :raises KeyError: if `cldf_spec` is a key for which no spec is declared.
        """
        if not isinstance(cldf_spec, CLDFSpec):
            # Anything but a `CLDFSpec` instance is interpreted as lookup key.
            cldf_spec = self.cldf_specs_dict[cldf_spec]
        return cldf_spec.get_dataset()

    @functools.cached_property
    def repo(self) -> Optional[Repository]:
        """
        The git repository cloned to the dataset's directory (or `None`).
        """
        try:
            return Repository(self.dir)
        except ValueError:  # pragma: no cover
            return None

    def _cmd_download(self, args):
        """
        Wrapper around :meth:`cmd_download`: makes sure `raw_dir` exists before the download
        runs, and writes a timestamped `README.md` into `raw_dir` afterwards.
        """
        self.raw_dir.mkdir(exist_ok=True)
        self.cmd_download(args)
        (self.raw_dir / 'README.md').write_text(
            f'Raw data downloaded {utcnow().isoformat()}', encoding='utf8')

    def cmd_download(self, args: argparse.Namespace):
        """
        Implementations of this method should populate the dataset's `raw_dir` with the source
        data.

        The default implementation only logs a warning and returns `NOOP`.
        """
        args.log.warning('cmd_download not implemented for dataset %s', self.id)
        return NOOP

    def _cmd_readme(self, args: argparse.Namespace):
        """
        Wrapper around :meth:`cmd_readme`: writes the dataset's `README.md`.

        The markdown returned by :meth:`cmd_readme` is extended with a build status badge
        (if there is one) after the first top-level heading and with a section listing the
        CLDF datasets for which a metadata file exists. Nothing is written if the dataset has
        no metadata.
        """
        if self.metadata:
            badge = build_status_badge(self)
            md = self.cmd_readme(args)
            if badge:
                # Insert the badge right below the first level-1 heading only.
                lines, title_found = [], False
                for line in md.split('\n'):
                    lines.append(line)
                    if line.startswith('# ') and not title_found:
                        title_found = True
                        lines.extend(['', badge])
                md = '\n'.join(lines)

            # Links in the README are relative to the dataset directory.
            rel_cldf_dir = self.cldf_dir.resolve().relative_to(self.dir.resolve())
            section = [
                '\n\n## CLDF Datasets\n',
                f'The following CLDF datasets are available in [{rel_cldf_dir}]({rel_cldf_dir}):\n'
            ]
            for ds in self.cldf_specs_dict.values():
                # Only list CLDF datasets which have actually been created.
                if ds.metadata_path.exists():
                    rel_p = ds.metadata_path.resolve().relative_to(self.dir.resolve())
                    module_link = (f'[{ds.module}](https://github.com/cldf/cldf/tree/master'
                                   f'/modules/{ds.module})')
                    section.append(f'- CLDF {module_link} at [{rel_p}]({rel_p})')

            self.dir.joinpath('README.md').write_text(md + '\n'.join(section), encoding='utf8')

    def cmd_readme(self, _: argparse.Namespace) -> str:
        """
        Implementations of this method should create the content for the dataset's README.md
        and return it as markdown formatted string.

        The default implementation returns the markdown representation of the dataset's
        metadata, or an empty string if there is no metadata.
        """
        return self.metadata.markdown() if self.metadata else ''

    def _cmd_makecldf(self, args):
        """
        Wrapper around :meth:`cmd_makecldf`: provides `args.writer` if the dataset declares
        exactly one CLDF spec, and writes a `LICENSE` file if the metadata specifies a known
        license with legal code.
        """
        specs = list(self.cldf_specs_dict.values())
        if len(specs) == 1:
            # There's only one CLDF spec! We instantiate the writer now and inject it into `args`:
            with self.cldf_writer(args, cldf_spec=specs[0]) as writer:
                args.writer = writer
                self.cmd_makecldf(args)
        else:
            # No writer is injected here; `cmd_makecldf` is called with `args` as passed in.
            self.cmd_makecldf(args)

        if self.metadata and self.metadata.known_license:
            legalcode = self.metadata.known_license.legalcode
            if legalcode:
                (self.dir / 'LICENSE').write_text(legalcode, encoding='utf8')

    def cmd_makecldf(self, args: argparse.Namespace):
        """
        Implementations of this method should write the CLDF data curated by the dataset.

        The default implementation only logs a warning and returns `NOOP`.

        :param args: An `argparse.Namespace` including attributes: \
        - `writer`: :class:`CLDFWriter` instance (only if the dataset declares exactly one \
        CLDF spec)
        """
        args.log.warning('cmd_makecldf not implemented for dataset %s', self.id)
        return NOOP


def iter_datasets(ep: str = ENTRY_POINT) -> Generator[Dataset, None, None]:
    """
    Yields `Dataset` instances registered for the specified entry point.

    Entry points which fail with an `ImportError` are skipped; a warning naming the entry
    point is logged on the `cldfbench` logger instead.

    :param ep: Name of the entry point.
    """
    for entry_point in get_entrypoints(ep):
        try:
            dataset_cls = entry_point.load()
            # Instantiate within the `try`, so that import errors triggered by initializing
            # the `Dataset` are handled as well.
            dataset = dataset_cls()
            yield dataset
        except ImportError as exc:  # pragma: no cover
            logger = logging.getLogger('cldfbench')
            logger.warning('Error importing %s: %s', entry_point.name, exc)


def get_dataset(spec, ep: str = ENTRY_POINT) -> Optional[Dataset]:
    """
    Get an initialised `Dataset` instance.

    :param spec: Specification of the dataset, either an ID or a path to a Python module \
    containing a subclass of :class:`Dataset`.
    :param ep: Name of the entry point searched for installed datasets.
    :return: The matching `Dataset` instance or `None` if no dataset could be found.
    """
    # First assume `spec` is the ID of an installed dataset, i.e. look for the first dataset
    # registered for the entry point which has a matching ID:
    installed = next((ds for ds in iter_datasets(ep=ep) if ds.id == spec), None)
    if installed is not None:
        return installed

    # Then check whether `spec` points to a python module providing a `Dataset` subclass:
    return dataset_from_module(spec) or None


def get_datasets(spec, ep=ENTRY_POINT, glob: bool = False) -> list[Dataset]:
    """
    Get a list of initialised `Dataset` instances.

    :param spec: Either `'*'` to get all datasets for a specific entry point, or glob pattern \
    matching dataset modules in the current directory (if `glob == True`), or a `str` as accepted \
    by :func:`get_dataset`.
    :param ep: Name of the entry point searched for installed datasets.
    :param glob: `bool` flag signaling whether to interpret `spec` as glob pattern.
    :return: `list` of the datasets found (possibly empty).
    """
    if spec == '*':
        # All datasets registered for the entry point.
        found = list(iter_datasets(ep))
    elif glob:
        # One candidate module per path matching the pattern; non-dataset paths are dropped.
        module_paths = pathlib.Path('.').glob(spec)
        found = nfilter(dataset_from_module(module_path) for module_path in module_paths)
    else:
        found = nfilter([get_dataset(spec, ep=ep)])
    return found


def dataset_from_module(path: PathType) -> Optional[Dataset]:
    """
    Load the first `Dataset` subclass found in the module which does not have any subclasses.

    :param path: Path of a Python module.
    :return: An instance of the class found, or `None` if `path` is not an existing file or \
    the module does not provide a suitable class.
    """
    module_path = pathlib.Path(path)
    if not (module_path.exists() and module_path.is_file()):
        return None

    module_name = module_path.stem
    # The module's directory must be importable while loading the module.
    with sys_path(module_path.parent):
        if module_name in sys.modules:
            # A module of this name has been imported before: reload it.
            module = importlib.reload(sys.modules[module_name])
        else:
            module = importlib.import_module(module_name)

    for _name, member in inspect.getmembers(module, inspect.isclass):
        # Classes with subclasses are skipped, only "leaf" classes qualify.
        if issubclass(member, Dataset) and not member.__subclasses__():
            return member()
    return None