# Source code for cldfbench.metadata

"""
Dataset metadata
"""
import json
import pathlib
from typing import Optional
import collections
import dataclasses

from clldutils import licenses
from clldutils.misc import nfilter
from clldutils.markup import iter_markdown_tables

__all__ = ['Metadata', 'get_creators_and_contributors']

# Canonical spellings of the contributor roles recognised by
# `get_creators_and_contributors`: role names read from the contributors table
# are lower-cased and mapped back to one of these values (with 'Other' as the
# fallback in non-strict mode).
# NOTE(review): the names look like the DataCite `contributorType` vocabulary -
# confirm against the upstream schema before adding or removing entries.
CONTRIBUTOR_TYPES = {
    'ContactPerson',
    'DataCollector',
    'DataCurator',
    'DataManager',
    'Distributor',
    'Editor',
    'Funder',
    'HostingInstitution',
    'Producer',
    'ProjectLeader',
    'ProjectManager',
    'ProjectMember',
    'RegistrationAgency',
    'RegistrationAuthority',
    'RelatedPerson',
    'Researcher',
    'ResearchGroup',
    'RightsHolder',
    'Supervisor',
    'Sponsor',
    'WorkPackageLeader',
    'Other',
}

# License identifiers that `Metadata.zenodo_license` is willing to report: a
# license matched for a dataset is only passed through when its `id` is a
# member of this set (membership is an exact, case-sensitive string match).
# NOTE(review): presumably this mirrors the license vocabulary accepted by
# Zenodo - confirm against the upstream list before editing.
LICENSES = {
    "AAL",
    "ADSL",
    "AFL-1.1",
    "AFL-3.0",
    "AGPL-1.0-only",
    "AGPL-3.0",
    "AGPL-3.0-only",
    "AGPL-3.0-or-later",
    "AMDPLPA",
    "AML",
    "AMPAS",
    "ANTLR-PD",
    "APL-1.0",
    "APSL-1.0",
    "APSL-1.1",
    "APSL-1.2",
    "APSL-2.0",
    "Adobe-2006",
    "Against-DRM",
    "Aladdin",
    "Apache-1.0",
    "Apache-1.1",
    "Apache-2.0",
    "Artistic-1.0",
    "Artistic-1.0-Perl",
    "Artistic-1.0-cl8",
    "Artistic-2.0",
    "BSD-1-Clause",
    "BSD-2-Clause",
    "BSD-2-Clause-FreeBSD",
    "BSD-3-Clause",
    "BSD-3-Clause-Clear",
    "BSD-3-Clause-LBNL",
    "BSD-3-Clause-No-Nuclear-License",
    "BSD-3-Clause-No-Nuclear-License-2014",
    "BSD-4-Clause",
    "BSD-4-Clause-UC",
    "BSD-Source-Code",
    "BSL-1.0",
    "Bahyph",
    "Barr",
    "Beerware",
    "BitTorrent-1.0",
    "BitTorrent-1.1",
    "CATOSL-1.1",
    "CC-BY-1.0",
    "CC-BY-3.0",
    "CC-BY-4.0",
    "CC-BY-NC-1.0",
    "CC-BY-NC-2.5",
    "CC-BY-NC-3.0",
    "CC-BY-NC-4.0",
    "CC-BY-NC-ND-1.0",
    "CC-BY-NC-ND-2.0",
    "CC-BY-NC-ND-2.5",
    "CC-BY-NC-ND-3.0",
    "CC-BY-NC-ND-4.0",
    "CC-BY-NC-SA-1.0",
    "CC-BY-NC-SA-3.0",
    "CC-BY-NC-SA-4.0",
    "CC-BY-ND-1.0",
    "CC-BY-ND-2.0",
    "CC-BY-ND-2.5",
    "CC-BY-ND-4.0",
    "CC-BY-SA-2.0",
    "CC-BY-SA-2.5",
    "CC-BY-SA-3.0",
    "CC-BY-SA-4.0",
    "CC0-1.0",
    "CDDL-1.0",
    "CDLA-Permissive-1.0",
    "CDLA-Sharing-1.0",
    "CECILL-1.1",
    "CECILL-2.0",
    "CECILL-2.1",
    "CECILL-B",
    "CECILL-C",
    "CNRI-Jython",
    "CNRI-Python",
    "CNRI-Python-GPL-Compatible",
    "CPAL-1.0",
    "CPOL-1.02",
    "CUA-OPL-1.0",
    "Caldera",
    "ClArtistic",
    "Condor-1.1",
    "CrystalStacker",
    "Cube",
    "D-FSL-1.0",
    "DSDP",
    "DSL",
    "ECL-2.0",
    "EFL-1.0",
    "EFL-2.0",
    "EPL-1.0",
    "EUDatagrid",
    "EUPL-1.0",
    "EUPL-1.1",
    "EUPL-1.2",
    "Entessa",
    "ErlPL-1.1",
    "Eurosym",
    "FAL-1.3",
    "FSFAP",
    "Fair",
    "Frameworx-1.0",
    "GFDL-1.1",
    "GFDL-1.1-only",
    "GFDL-1.2",
    "GFDL-1.2-only",
    "GFDL-1.2-or-later",
    "GFDL-1.3-no-cover-texts-no-invariant-sections",
    "GL2PS",
    "GPL-1.0+",
    "GPL-1.0-or-later",
    "GPL-2.0",
    "GPL-2.0+",
    "GPL-2.0-with-GCC-exception",
    "GPL-2.0-with-bison-exception",
    "GPL-2.0-with-classpath-exception",
    "GPL-3.0",
    "GPL-3.0-only",
    "GPL-3.0-or-later",
    "GPL-3.0-with-GCC-exception",
    "Giftware",
    "Glulxe",
    "HPND",
    "HaskellReport",
    "IBM-pibs",
    "ICU",
    "IJG",
    "IPA",
    "IPL-1.0",
    "ISC",
    "ImageMagick",
    "Imlib2",
    "Intel",
    "Intel-ACPI",
    "JSON",
    "LGPL-2.0",
    "LGPL-2.0-or-later",
    "LGPL-2.1",
    "LGPL-2.1-only",
    "LGPL-3.0",
    "LGPL-3.0-or-later",
    "LGPLLR",
    "LPL-1.0",
    "LPL-1.02",
    "LPPL-1.0",
    "LPPL-1.2",
    "LPPL-1.3c",
    "LiLiQ-R-1.1",
    "LiLiQ-Rplus-1.1",
    "Linux-OpenIB",
    "MIT",
    "MIT-advertising",
    "MIT-enna",
    "MPL-1.0",
    "MPL-1.1",
    "MPL-2.0",
    "MPL-2.0-no-copyleft-exception",
    "MS-PL",
    "MS-RL",
    "MirOS",
    "Motosoto",
    "Multics",
    "Mup",
    "NASA-1.3",
    "NCSA",
    "NGPL",
    "NOSL",
    "NPL-1.1",
    "NPOSL-3.0",
    "NTP",
    "Naumen",
    "Newsletr",
    "Nokia",
    "Noweb",
    "Nunit",
    "OCCT-PL",
    "OCLC-2.0",
    "ODC-By-1.0",
    "ODC-PDDL-1.0",
    "ODbL-1.0",
    "OFL-1.0",
    "OFL-1.1",
    "OGL-Canada-2.0",
    "OGL-UK-1.0",
    "OGL-UK-2.0",
    "OGL-UK-3.0",
    "OGTSL",
    "OLDAP-1.2",
    "OLDAP-1.3",
    "OLDAP-1.4",
    "OLDAP-2.0",
    "OLDAP-2.0.1",
    "OLDAP-2.1",
    "OLDAP-2.2",
    "OLDAP-2.2.1",
    "OLDAP-2.2.2",
    "OLDAP-2.3",
    "OLDAP-2.4",
    "OLDAP-2.6",
    "OLDAP-2.8",
    "OSET-PL-2.1",
    "OSL-1.0",
    "OSL-1.1",
    "OSL-2.0",
    "OSL-2.1",
    "OSL-3.0",
    "OpenSSL",
    "PHP-3.0",
    "PHP-3.01",
    "Plexus",
    "PostgreSQL",
    "Python-2.0",
    "QPL-1.0",
    "Qhull",
    "RHeCos-1.1",
    "RPL-1.1",
    "RPL-1.5",
    "RPSL-1.0",
    "RSA-MD",
    "RSCPL",
    "Ruby",
    "SAX-PD",
    "SCEA",
    "SGI-B-2.0",
    "SISSL",
    "SMLNJ",
    "SPL-1.0",
    "SWL",
    "Sendmail",
    "Sendmail-8.23",
    "SimPL-2.0",
    "Sleepycat",
    "Spencer-94",
    "Spencer-99",
    "SugarCRM-1.1.3",
    "TCL",
    "TCP-wrappers",
    "TOSL",
    "TU-Berlin-2.0",
    "Unicode-DFS-2015",
    "Unicode-TOU",
    "Unlicense",
    "VSL-1.0",
    "Vim",
    "W3C",
    "W3C-20150513",
    "Watcom-1.0",
    "X11",
    "XFree86-1.1",
    "XSkat",
    "Xerox",
    "Xnet",
    "ZPL-1.1",
    "ZPL-2.0",
    "Zed",
    "Zend-2.0",
    "Zimbra-1.3",
    "Zlib",
    "bsd-license",
    "bzip2-1.0.5",
    "canada-crown",
    "cc-nc",
    "curl",
    "diffmark",
    "dli-model-use",
    "dvipdfm",
    "eCos-2.0",
    "eGenix",
    "eurofound",
    "geo-no-fee-unrestricted",
    "geogratis",
    "gnuplot",
    "hesa-withrights",
    "jabber-osl",
    "libtiff",
    "localauth-withrights",
    "lucent-plan9",
    "met-office-cp",
    "mitre",
    "mpich2",
    "notspecified",
    "other-at",
    "other-closed",
    "other-nc",
    "other-open",
    "other-pd",
    "psfrag",
    "psutils",
    "ukclickusepsi",
    "ukcrown",
    "ukcrown-withrights",
    "ukpsi",
    "user-jsim",
    "wxWindows",
    "xpp",
    "zlib-acknowledgement",
}


@dataclasses.dataclass
class Metadata:
    """
    Dataset metadata is used as follows:

    - it is (partly) elicited when creating a new dataset directory ...
    - ... and subsequently written to the directory ...
    - ... where it may be edited ("by hand") ...
    - ... and from where it is read when initializing a `Dataset` object.

    To add custom metadata fields for a dataset,

    - inherit from `Metadata` (decorating the subclass with `dataclasses.dataclass`),
    - add more `dataclasses.field` s,
    - register the subclass with the dataset by assigning it to
      `cldfbench.Dataset.metadata_cls`.

    Field `metadata` keys used below: `elicit` marks a field whose value is prompted
    for in `Metadata.elicit`; `required` is set on some fields but is not consulted
    anywhere in this class.
    """
    id: Optional[str] = dataclasses.field(
        default=None,
        metadata=dict(elicit=True, required=True))  # pylint: disable=R1735
    title: Optional[str] = dataclasses.field(
        default=None,
        metadata=dict(elicit=True, required=True))  # pylint: disable=R1735
    description: Optional[str] = dataclasses.field(
        default=None)
    license: Optional[str] = dataclasses.field(
        default=None,
        metadata=dict(elicit=True, required=True))  # pylint: disable=R1735
    url: Optional[str] = dataclasses.field(
        default=None,
        metadata=dict(elicit=True))  # pylint: disable=R1735
    citation: Optional[str] = dataclasses.field(
        default=None,
        metadata=dict(elicit=True, required=True))  # pylint: disable=R1735

    @classmethod
    def elicit(cls) -> 'Metadata':
        """
        Factory method, called when creating a new dataset directory.

        Prompts on stdin (via `input`) for every field flagged with `elicit` in its
        field metadata. An empty answer is replaced by the field's default when that
        default is truthy; otherwise the empty string is kept.

        :return: A new instance populated with the elicited values.
        """
        kw = {}
        for field in dataclasses.fields(cls):
            if field.metadata.get('elicit', False):
                res = input(f'{field.name}: ')
                # `dataclasses.MISSING` is a truthy sentinel (fields declared with only a
                # `default_factory` carry it); never hand it to the constructor as a value.
                if (not res) and field.default is not dataclasses.MISSING and field.default:
                    res = field.default
                kw[field.name] = res
        return cls(**kw)

    @classmethod
    def from_file(cls, fname: pathlib.Path) -> 'Metadata':
        """
        Factory method, called when instantiating a `Dataset` object.

        Keys in the JSON object which do not correspond to a field of `cls` are ignored.

        :param fname: Path of a UTF-8 encoded JSON file.
        :return: A new instance populated from the file.
        :raises ValueError: If the file is not valid JSON or its top-level value is not \
        a JSON object.
        """
        with fname.open('r', encoding='utf-8') as fp:
            try:
                data = json.load(fp)
            except json.decoder.JSONDecodeError as e:  # pragma: no cover
                raise ValueError(f'Invalid JSON file: {fname.resolve()}\n{e}') from e
        if not isinstance(data, dict):
            # Report a wrong top-level type the same way as malformed JSON, rather than
            # failing with an AttributeError on `.items()`.
            raise ValueError(
                f'Invalid JSON file: {fname.resolve()}\nexpected a JSON object at the top level')
        fields = {f.name for f in dataclasses.fields(cls)}
        return cls(**{k: v for k, v in data.items() if k in fields})

    def write(self, fname: pathlib.Path) -> None:
        """
        Dump the metadata as JSON to disk.

        :param fname: Path of the file to (over)write, encoded as UTF-8.
        """
        with fname.open('w', encoding='utf-8') as fp:
            json.dump(dataclasses.asdict(self), fp, indent=4)

    @property
    def known_license(self) -> 'Optional[licenses.License]':
        """
        A known license - if one can be matched to self.license.

        The lookup is delegated to `licenses.find`; `None` is returned without a lookup
        when no license is set.
        """
        if self.license:
            return licenses.find(self.license)
        return None  # pragma: no cover

    @property
    def zenodo_license(self) -> Optional[str]:
        """
        A license ID suitable for inclusion in metadata for Zenodo.

        :return: The id of the matched license if it is listed in `LICENSES`, else `None`.
        """
        # Evaluate the property once; each access repeats the license lookup.
        known = self.known_license
        if known and known.id in LICENSES:
            return known.id
        return None  # pragma: no cover

    def common_props(self) -> collections.OrderedDict[str, str]:
        """
        The metadata as JSON-LD object suitable for inclusion in CLDF metadata.

        Only properties with a truthy value are included. `dc:license` is the URL of the
        matched license if there is one, otherwise the raw `license` string.
        """
        res = collections.OrderedDict()
        if self.title:
            res["dc:title"] = self.title
        if self.description:
            res["dc:description"] = self.description
        if self.citation:
            res["dc:bibliographicCitation"] = self.citation
        if self.url:
            res["dc:identifier"] = self.url
        # Evaluate the property once; each access repeats the license lookup.
        known = self.known_license
        if known:
            res['dc:license'] = known.url
        elif self.license:
            res['dc:license'] = self.license
        return res

    def markdown(self) -> str:
        """
        A human-readable version of the metadata formatted as Markdown.

        The text has a title heading (falling back to `Dataset <id>`), a "How to cite"
        section and a "Description" section listing description, license and URL when
        these are set.
        """
        lines = [
            '# ' + (self.title or f'Dataset {self.id}') + '\n',
            '## How to cite\n\nIf you use these data please cite',
        ]
        if self.citation:
            lines.append('- the original source')
            # Every line of a multi-line citation is rendered as a quoted line.
            lines.extend([f" > {line}" for line in self.citation.split('\n')])
            lines.extend([
                "- the derived dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            ])
        else:  # pragma: no cover
            lines.extend([
                "this dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            ])
        lines.append('\n## Description\n\n')
        if self.description:
            lines.append(f'{self.description}\n')
        if self.license:
            lines.append(f'This dataset is licensed under a {self.license} license\n')
        if self.url:
            lines.append(f'Available online at {self.url}\n')
        return '\n'.join(lines)
TableRowsType = list[dict[str, str]]


def get_creators_and_contributors(
        text: str,
        strict: bool = True,
) -> tuple[TableRowsType, TableRowsType]:
    """
    Read contributor info from a markdown formatted table.

    Only the first table yielded by `iter_markdown_tables` for `text` is read. Column
    names are lower-cased, and the comma-separated `role` column decides where a row
    ends up: the roles `author`, `creator` and `maintainer` put the row (minus its
    `role` column) into the creators list; any other role puts it into the
    contributors list with an added `type` key taken from `CONTRIBUTOR_TYPES`.
    A row with several roles may thus appear in both lists; identical entries are
    added to a list only once.

    :param text: Markdown text expected to contain the contributors table.
    :param strict: If `True`, a role that is not a known contributor type raises \
    `KeyError`; if `False`, such a role is recorded with type `'Other'`.
    :return: The pair `(creators, contributors)`; both are empty if `text` contains no table.
    """
    canonical_types = {name.lower(): name for name in CONTRIBUTOR_TYPES}
    creator_roles = {'author', 'creator', 'maintainer'}
    creators, contributors = [], []

    # Only the first table is of interest (presumably the one in CONTRIBUTORS.md).
    try:
        header, rows = next(iter_markdown_tables(text))
    except StopIteration:  # pragma: no cover
        return creators, contributors

    for cells in rows:
        record = {key.lower(): value for key, value in zip(header, cells)}
        roles = nfilter(
            [part.strip().lower() for part in record.get('role', '').split(',')])
        for role in roles:
            # A fresh dict per role, because the contributor branch adds a role-specific key.
            person = {key: value for key, value in record.items() if key != 'role'}
            if role in creator_roles:
                target = creators
            else:
                person['type'] = (
                    canonical_types[role] if strict else canonical_types.get(role, 'Other'))
                target = contributors
            if person not in target:
                target.append(person)
    return creators, contributors