# Source code for cldfbench.metadata

"""
Dataset metadata
"""
import json
import pathlib
from typing import Optional
import collections
import dataclasses

from clldutils import licenses
from clldutils.misc import nfilter
from clldutils.markup import iter_markdown_tables

# Names exported by `from cldfbench.metadata import *`.
__all__ = ['Metadata', 'get_creators_and_contributors']

# Recognized contributor types. `get_creators_and_contributors` matches role names
# case-insensitively against this set and reports the spelling used here.
# NOTE(review): presumably mirrors the DataCite `contributorType` vocabulary - confirm.
CONTRIBUTOR_TYPES = {
    'ContactPerson',
    'DataCollector',
    'DataCurator',
    'DataManager',
    'Distributor',
    'Editor',
    'Funder',
    'HostingInstitution',
    'Producer',
    'ProjectLeader',
    'ProjectManager',
    'ProjectMember',
    'RegistrationAgency',
    'RegistrationAuthority',
    'RelatedPerson',
    'Researcher',
    'ResearchGroup',
    'RightsHolder',
    'Supervisor',
    'Sponsor',
    'WorkPackageLeader',
    'Other',
}

# License identifiers consulted by `Metadata.zenodo_license`: a license matched via
# `clldutils.licenses.find` is only reported there if its `id` is a member of this set.
# NOTE(review): presumably the license vocabulary accepted by Zenodo - confirm against
# the source this list was derived from.
LICENSES = {
    "AAL",
    "ADSL",
    "AFL-1.1",
    "AFL-3.0",
    "AGPL-1.0-only",
    "AGPL-3.0",
    "AGPL-3.0-only",
    "AGPL-3.0-or-later",
    "AMDPLPA",
    "AML",
    "AMPAS",
    "ANTLR-PD",
    "APL-1.0",
    "APSL-1.0",
    "APSL-1.1",
    "APSL-1.2",
    "APSL-2.0",
    "Adobe-2006",
    "Against-DRM",
    "Aladdin",
    "Apache-1.0",
    "Apache-1.1",
    "Apache-2.0",
    "Artistic-1.0",
    "Artistic-1.0-Perl",
    "Artistic-1.0-cl8",
    "Artistic-2.0",
    "BSD-1-Clause",
    "BSD-2-Clause",
    "BSD-2-Clause-FreeBSD",
    "BSD-3-Clause",
    "BSD-3-Clause-Clear",
    "BSD-3-Clause-LBNL",
    "BSD-3-Clause-No-Nuclear-License",
    "BSD-3-Clause-No-Nuclear-License-2014",
    "BSD-4-Clause",
    "BSD-4-Clause-UC",
    "BSD-Source-Code",
    "BSL-1.0",
    "Bahyph",
    "Barr",
    "Beerware",
    "BitTorrent-1.0",
    "BitTorrent-1.1",
    "CATOSL-1.1",
    "CC-BY-1.0",
    "CC-BY-3.0",
    "CC-BY-4.0",
    "CC-BY-NC-1.0",
    "CC-BY-NC-2.5",
    "CC-BY-NC-3.0",
    "CC-BY-NC-4.0",
    "CC-BY-NC-ND-1.0",
    "CC-BY-NC-ND-2.0",
    "CC-BY-NC-ND-2.5",
    "CC-BY-NC-ND-3.0",
    "CC-BY-NC-ND-4.0",
    "CC-BY-NC-SA-1.0",
    "CC-BY-NC-SA-3.0",
    "CC-BY-NC-SA-4.0",
    "CC-BY-ND-1.0",
    "CC-BY-ND-2.0",
    "CC-BY-ND-2.5",
    "CC-BY-ND-4.0",
    "CC-BY-SA-2.0",
    "CC-BY-SA-2.5",
    "CC-BY-SA-3.0",
    "CC-BY-SA-4.0",
    "CC0-1.0",
    "CDDL-1.0",
    "CDLA-Permissive-1.0",
    "CDLA-Sharing-1.0",
    "CECILL-1.1",
    "CECILL-2.0",
    "CECILL-2.1",
    "CECILL-B",
    "CECILL-C",
    "CNRI-Jython",
    "CNRI-Python",
    "CNRI-Python-GPL-Compatible",
    "CPAL-1.0",
    "CPOL-1.02",
    "CUA-OPL-1.0",
    "Caldera",
    "ClArtistic",
    "Condor-1.1",
    "CrystalStacker",
    "Cube",
    "D-FSL-1.0",
    "DSDP",
    "DSL",
    "ECL-2.0",
    "EFL-1.0",
    "EFL-2.0",
    "EPL-1.0",
    "EUDatagrid",
    "EUPL-1.0",
    "EUPL-1.1",
    "EUPL-1.2",
    "Entessa",
    "ErlPL-1.1",
    "Eurosym",
    "FAL-1.3",
    "FSFAP",
    "Fair",
    "Frameworx-1.0",
    "GFDL-1.1",
    "GFDL-1.1-only",
    "GFDL-1.2",
    "GFDL-1.2-only",
    "GFDL-1.2-or-later",
    "GFDL-1.3-no-cover-texts-no-invariant-sections",
    "GL2PS",
    "GPL-1.0+",
    "GPL-1.0-or-later",
    "GPL-2.0",
    "GPL-2.0+",
    "GPL-2.0-with-GCC-exception",
    "GPL-2.0-with-bison-exception",
    "GPL-2.0-with-classpath-exception",
    "GPL-3.0",
    "GPL-3.0-only",
    "GPL-3.0-or-later",
    "GPL-3.0-with-GCC-exception",
    "Giftware",
    "Glulxe",
    "HPND",
    "HaskellReport",
    "IBM-pibs",
    "ICU",
    "IJG",
    "IPA",
    "IPL-1.0",
    "ISC",
    "ImageMagick",
    "Imlib2",
    "Intel",
    "Intel-ACPI",
    "JSON",
    "LGPL-2.0",
    "LGPL-2.0-or-later",
    "LGPL-2.1",
    "LGPL-2.1-only",
    "LGPL-3.0",
    "LGPL-3.0-or-later",
    "LGPLLR",
    "LPL-1.0",
    "LPL-1.02",
    "LPPL-1.0",
    "LPPL-1.2",
    "LPPL-1.3c",
    "LiLiQ-R-1.1",
    "LiLiQ-Rplus-1.1",
    "Linux-OpenIB",
    "MIT",
    "MIT-advertising",
    "MIT-enna",
    "MPL-1.0",
    "MPL-1.1",
    "MPL-2.0",
    "MPL-2.0-no-copyleft-exception",
    "MS-PL",
    "MS-RL",
    "MirOS",
    "Motosoto",
    "Multics",
    "Mup",
    "NASA-1.3",
    "NCSA",
    "NGPL",
    "NOSL",
    "NPL-1.1",
    "NPOSL-3.0",
    "NTP",
    "Naumen",
    "Newsletr",
    "Nokia",
    "Noweb",
    "Nunit",
    "OCCT-PL",
    "OCLC-2.0",
    "ODC-By-1.0",
    "ODC-PDDL-1.0",
    "ODbL-1.0",
    "OFL-1.0",
    "OFL-1.1",
    "OGL-Canada-2.0",
    "OGL-UK-1.0",
    "OGL-UK-2.0",
    "OGL-UK-3.0",
    "OGTSL",
    "OLDAP-1.2",
    "OLDAP-1.3",
    "OLDAP-1.4",
    "OLDAP-2.0",
    "OLDAP-2.0.1",
    "OLDAP-2.1",
    "OLDAP-2.2",
    "OLDAP-2.2.1",
    "OLDAP-2.2.2",
    "OLDAP-2.3",
    "OLDAP-2.4",
    "OLDAP-2.6",
    "OLDAP-2.8",
    "OSET-PL-2.1",
    "OSL-1.0",
    "OSL-1.1",
    "OSL-2.0",
    "OSL-2.1",
    "OSL-3.0",
    "OpenSSL",
    "PHP-3.0",
    "PHP-3.01",
    "Plexus",
    "PostgreSQL",
    "Python-2.0",
    "QPL-1.0",
    "Qhull",
    "RHeCos-1.1",
    "RPL-1.1",
    "RPL-1.5",
    "RPSL-1.0",
    "RSA-MD",
    "RSCPL",
    "Ruby",
    "SAX-PD",
    "SCEA",
    "SGI-B-2.0",
    "SISSL",
    "SMLNJ",
    "SPL-1.0",
    "SWL",
    "Sendmail",
    "Sendmail-8.23",
    "SimPL-2.0",
    "Sleepycat",
    "Spencer-94",
    "Spencer-99",
    "SugarCRM-1.1.3",
    "TCL",
    "TCP-wrappers",
    "TOSL",
    "TU-Berlin-2.0",
    "Unicode-DFS-2015",
    "Unicode-TOU",
    "Unlicense",
    "VSL-1.0",
    "Vim",
    "W3C",
    "W3C-20150513",
    "Watcom-1.0",
    "X11",
    "XFree86-1.1",
    "XSkat",
    "Xerox",
    "Xnet",
    "ZPL-1.1",
    "ZPL-2.0",
    "Zed",
    "Zend-2.0",
    "Zimbra-1.3",
    "Zlib",
    "bsd-license",
    "bzip2-1.0.5",
    "canada-crown",
    "cc-nc",
    "curl",
    "diffmark",
    "dli-model-use",
    "dvipdfm",
    "eCos-2.0",
    "eGenix",
    "eurofound",
    "geo-no-fee-unrestricted",
    "geogratis",
    "gnuplot",
    "hesa-withrights",
    "jabber-osl",
    "libtiff",
    "localauth-withrights",
    "lucent-plan9",
    "met-office-cp",
    "mitre",
    "mpich2",
    "notspecified",
    "other-at",
    "other-closed",
    "other-nc",
    "other-open",
    "other-pd",
    "psfrag",
    "psutils",
    "ukclickusepsi",
    "ukcrown",
    "ukcrown-withrights",
    "ukpsi",
    "user-jsim",
    "wxWindows",
    "xpp",
    "zlib-acknowledgement",
}


@dataclasses.dataclass
class Metadata:
    """
    Dataset metadata is used as follows:

    - it is (partly) elicited when creating a new dataset directory ...
    - ... and subsequently written to the directory ...
    - ... where it may be edited ("by hand") ...
    - ... and from where it is read when initializing a `Dataset` object.

    To add custom metadata fields for a dataset,

    - inherit from `Metadata` (decorating the subclass with `dataclasses.dataclass`),
    - add more `dataclasses.field` s,
    - register the subclass with the dataset by assigning it to `cldfbench.Dataset.metadata_cls`.

    Field metadata keys:

    - ``elicit``: the field is prompted for by `Metadata.elicit`.
    - ``required``: set on some fields, but not consulted by any method of this class.
    """
    id: Optional[str] = dataclasses.field(
        default=None,
        metadata={'elicit': True, 'required': True})
    title: Optional[str] = dataclasses.field(
        default=None,
        metadata={'elicit': True, 'required': True})
    description: Optional[str] = dataclasses.field(
        default=None)
    license: Optional[str] = dataclasses.field(
        default=None,
        metadata={'elicit': True, 'required': True})
    url: Optional[str] = dataclasses.field(
        default=None,
        metadata={'elicit': True})
    citation: Optional[str] = dataclasses.field(
        default=None,
        metadata={'elicit': True, 'required': True})

    @classmethod
    def elicit(cls) -> 'Metadata':
        """
        Factory method, called when creating a new dataset directory.

        Prompts on stdin (via `input`) for every field flagged with ``elicit`` metadata.
        An empty answer is replaced by the field's default if that default is truthy;
        otherwise the empty string is passed on.

        :return: A new instance of `cls` populated from the answers.
        """
        kw = {}
        for field in dataclasses.fields(cls):
            if not field.metadata.get('elicit', False):
                continue
            res = input(f'{field.name}: ')
            # Fields declared without a plain default report the (truthy) sentinel
            # `dataclasses.MISSING`, which must never be passed on as a value.
            if not res and field.default is not dataclasses.MISSING and field.default:
                res = field.default
            kw[field.name] = res
        return cls(**kw)

    @classmethod
    def from_file(cls, fname: pathlib.Path) -> 'Metadata':
        """
        Factory method, called when instantiating a `Dataset` object.

        Keys in the JSON object which do not correspond to a field of `cls` are ignored.

        :param fname: Path of a UTF-8 encoded JSON file.
        :return: A new instance of `cls`.
        :raises ValueError: If the file is not valid JSON or its top-level value is not \
        a JSON object.
        """
        with fname.open('r', encoding='utf-8') as fp:
            # Only the parsing step is guarded, so errors from `cls(...)` are not
            # caught by this handler.
            try:
                data = json.load(fp)
            except json.decoder.JSONDecodeError as e:  # pragma: no cover
                raise ValueError(f'Invalid JSON file: {fname.resolve()}\n{e}') from e
        if not isinstance(data, dict):
            raise ValueError(
                f'Invalid metadata file (expected a JSON object): {fname.resolve()}')
        fields = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in data.items() if k in fields})

    def write(self, fname: pathlib.Path) -> None:
        """Dump the metadata as JSON to disk (UTF-8, indented by 4 spaces)."""
        with fname.open('w', encoding='utf-8') as fp:
            json.dump(dataclasses.asdict(self), fp, indent=4)

    # The return annotation is a string so that it is not evaluated when the class body
    # is executed.
    @property
    def known_license(self) -> 'Optional[licenses.License]':
        """
        A known license - if one can be matched to self.license.

        :return: The result of `licenses.find(self.license)`, or `None` if no license is set.
        """
        if self.license:
            return licenses.find(self.license)
        return None  # pragma: no cover

    @property
    def zenodo_license(self) -> Optional[str]:
        """
        A license ID suitable for inclusion in metadata for Zenodo.

        :return: The ID of the known license if it is listed in `LICENSES`, else `None`.
        """
        known = self.known_license  # Look up once; each access runs the license matching.
        if known and known.id in LICENSES:
            return known.id
        return None  # pragma: no cover

    def common_props(self) -> collections.OrderedDict[str, str]:
        """
        The metadata as JSON-LD object suitable for inclusion in CLDF metadata.

        Only properties with a truthy value are included. The license is given as the URL
        of the known license if one was matched, and as the raw `license` string otherwise.
        """
        res = collections.OrderedDict()
        if self.title:
            res["dc:title"] = self.title
        if self.description:
            res["dc:description"] = self.description
        if self.citation:
            res["dc:bibliographicCitation"] = self.citation
        if self.url:
            res["dc:identifier"] = self.url
        known = self.known_license  # Look up once; each access runs the license matching.
        if known:
            res['dc:license'] = known.url
        elif self.license:
            res['dc:license'] = self.license
        return res

    def markdown(self) -> str:
        """
        A human-readable version of the metadata formatted as Markdown.

        The text consists of a title heading (falling back to the dataset ID), a
        "How to cite" section and a "Description" section; the latter lists description,
        license and URL as far as they are set.
        """
        lines = [
            '# ' + (self.title or f'Dataset {self.id}') + '\n',
            '## How to cite\n\nIf you use these data please cite',
        ]
        if self.citation:
            lines.append('- the original source')
            # Each line of the citation is rendered as part of an indented block quote.
            lines.extend(f"  > {line}" for line in self.citation.split('\n'))
            lines.append(
                "- the derived dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            )
        else:  # pragma: no cover
            lines.append(
                "this dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            )

        lines.append('\n## Description\n\n')

        if self.description:
            lines.append(f'{self.description}\n')

        if self.license:
            lines.append(f'This dataset is licensed under a {self.license} license\n')

        if self.url:
            lines.append(f'Available online at {self.url}\n')

        return '\n'.join(lines)


# Rows of a table, one dict per row, mapping column name to cell content; used as
# the element type of both lists returned by `get_creators_and_contributors`.
TableRowsType = list[dict[str, str]]


def get_creators_and_contributors(
        text: str,
        strict: bool = True,
) -> tuple[TableRowsType, TableRowsType]:
    """
    Read contributor info from a markdown formatted table.

    Only the first table found in `text` is evaluated. Column names are lower-cased and the
    comma-separated values of the ``role`` column decide where a row ends up:

    - roles ``author``, ``creator`` or ``maintainer`` put the row into the creators list,
    - any other role puts it into the contributors list, with an additional ``type`` key
      holding the matching name from `CONTRIBUTOR_TYPES`.

    :param text: Markdown text, e.g. the content of a CONTRIBUTORS.md file.
    :param strict: If `True`, a role not matching any contributor type raises `KeyError`; \
    otherwise such roles are mapped to type ``Other``.
    :return: A pair `(creators, contributors)` of lists of row dicts without the ``role`` key; \
    duplicate entries are dropped.
    """
    creators, contributors = [], []
    # Only the first table of the document is of interest.
    first_table = next(iter_markdown_tables(text), None)
    if first_table is None:  # pragma: no cover
        return creators, contributors
    header, rows = first_table

    # Maps the lower-cased role name to the spelling used in CONTRIBUTOR_TYPES.
    canonical = {name.lower(): name for name in CONTRIBUTOR_TYPES}
    creator_roles = {'author', 'creator', 'maintainer'}

    for cells in rows:
        record = {key.lower(): value for key, value in zip(header, cells)}
        roles = nfilter(
            [part.strip().lower() for part in record.get('role', '').split(',')])
        for role in roles:
            # A fresh dict per role, because a contributor entry gets a role-specific type.
            person = {key: value for key, value in record.items() if key != 'role'}
            if role in creator_roles:
                target = creators
            else:
                person['type'] = canonical[role] if strict else canonical.get(role, 'Other')
                target = contributors
            if person not in target:
                target.append(person)
    return creators, contributors