# Source code for cldfbench.metadata

"""
Dataset metadata
"""
import json
import collections
import pathlib
import typing

import attr
from clldutils import licenses
from clldutils.misc import nfilter
from clldutils.markup import iter_markdown_tables

# Public names exported by `import *` from this module.
__all__ = ['Metadata', 'get_creators_and_contributors']

# Canonical spellings of the contributor types recognized by
# `get_creators_and_contributors`: roles are matched against these names
# case-insensitively, and the spelling given here is what ends up in a
# contributor's 'type' field.
# NOTE(review): presumably the DataCite `contributorType` vocabulary - confirm.
CONTRIBUTOR_TYPES = {
    'ContactPerson',
    'DataCollector',
    'DataCurator',
    'DataManager',
    'Distributor',
    'Editor',
    'Funder',
    'HostingInstitution',
    'Producer',
    'ProjectLeader',
    'ProjectManager',
    'ProjectMember',
    'RegistrationAgency',
    'RegistrationAuthority',
    'RelatedPerson',
    'Researcher',
    'ResearchGroup',
    'RightsHolder',
    'Supervisor',
    'Sponsor',
    'WorkPackageLeader',
    'Other',
}

# License identifiers that `Metadata.zenodo_license` is allowed to report: a dataset's
# license is only returned there if the `id` of the matched license is a member of this set.
# NOTE(review): presumably the license identifiers accepted by Zenodo - confirm, and
# check whether the list needs refreshing.
LICENSES = {
    "AAL",
    "ADSL",
    "AFL-1.1",
    "AFL-3.0",
    "AGPL-1.0-only",
    "AGPL-3.0",
    "AGPL-3.0-only",
    "AGPL-3.0-or-later",
    "AMDPLPA",
    "AML",
    "AMPAS",
    "ANTLR-PD",
    "APL-1.0",
    "APSL-1.0",
    "APSL-1.1",
    "APSL-1.2",
    "APSL-2.0",
    "Adobe-2006",
    "Against-DRM",
    "Aladdin",
    "Apache-1.0",
    "Apache-1.1",
    "Apache-2.0",
    "Artistic-1.0",
    "Artistic-1.0-Perl",
    "Artistic-1.0-cl8",
    "Artistic-2.0",
    "BSD-1-Clause",
    "BSD-2-Clause",
    "BSD-2-Clause-FreeBSD",
    "BSD-3-Clause",
    "BSD-3-Clause-Clear",
    "BSD-3-Clause-LBNL",
    "BSD-3-Clause-No-Nuclear-License",
    "BSD-3-Clause-No-Nuclear-License-2014",
    "BSD-4-Clause",
    "BSD-4-Clause-UC",
    "BSD-Source-Code",
    "BSL-1.0",
    "Bahyph",
    "Barr",
    "Beerware",
    "BitTorrent-1.0",
    "BitTorrent-1.1",
    "CATOSL-1.1",
    "CC-BY-1.0",
    "CC-BY-3.0",
    "CC-BY-4.0",
    "CC-BY-NC-1.0",
    "CC-BY-NC-2.5",
    "CC-BY-NC-3.0",
    "CC-BY-NC-4.0",
    "CC-BY-NC-ND-1.0",
    "CC-BY-NC-ND-2.0",
    "CC-BY-NC-ND-2.5",
    "CC-BY-NC-ND-3.0",
    "CC-BY-NC-ND-4.0",
    "CC-BY-NC-SA-1.0",
    "CC-BY-NC-SA-3.0",
    "CC-BY-NC-SA-4.0",
    "CC-BY-ND-1.0",
    "CC-BY-ND-2.0",
    "CC-BY-ND-2.5",
    "CC-BY-ND-4.0",
    "CC-BY-SA-2.0",
    "CC-BY-SA-2.5",
    "CC-BY-SA-3.0",
    "CC-BY-SA-4.0",
    "CC0-1.0",
    "CDDL-1.0",
    "CDLA-Permissive-1.0",
    "CDLA-Sharing-1.0",
    "CECILL-1.1",
    "CECILL-2.0",
    "CECILL-2.1",
    "CECILL-B",
    "CECILL-C",
    "CNRI-Jython",
    "CNRI-Python",
    "CNRI-Python-GPL-Compatible",
    "CPAL-1.0",
    "CPOL-1.02",
    "CUA-OPL-1.0",
    "Caldera",
    "ClArtistic",
    "Condor-1.1",
    "CrystalStacker",
    "Cube",
    "D-FSL-1.0",
    "DSDP",
    "DSL",
    "ECL-2.0",
    "EFL-1.0",
    "EFL-2.0",
    "EPL-1.0",
    "EUDatagrid",
    "EUPL-1.0",
    "EUPL-1.1",
    "EUPL-1.2",
    "Entessa",
    "ErlPL-1.1",
    "Eurosym",
    "FAL-1.3",
    "FSFAP",
    "Fair",
    "Frameworx-1.0",
    "GFDL-1.1",
    "GFDL-1.1-only",
    "GFDL-1.2",
    "GFDL-1.2-only",
    "GFDL-1.2-or-later",
    "GFDL-1.3-no-cover-texts-no-invariant-sections",
    "GL2PS",
    "GPL-1.0+",
    "GPL-1.0-or-later",
    "GPL-2.0",
    "GPL-2.0+",
    "GPL-2.0-with-GCC-exception",
    "GPL-2.0-with-bison-exception",
    "GPL-2.0-with-classpath-exception",
    "GPL-3.0",
    "GPL-3.0-only",
    "GPL-3.0-or-later",
    "GPL-3.0-with-GCC-exception",
    "Giftware",
    "Glulxe",
    "HPND",
    "HaskellReport",
    "IBM-pibs",
    "ICU",
    "IJG",
    "IPA",
    "IPL-1.0",
    "ISC",
    "ImageMagick",
    "Imlib2",
    "Intel",
    "Intel-ACPI",
    "JSON",
    "LGPL-2.0",
    "LGPL-2.0-or-later",
    "LGPL-2.1",
    "LGPL-2.1-only",
    "LGPL-3.0",
    "LGPL-3.0-or-later",
    "LGPLLR",
    "LPL-1.0",
    "LPL-1.02",
    "LPPL-1.0",
    "LPPL-1.2",
    "LPPL-1.3c",
    "LiLiQ-R-1.1",
    "LiLiQ-Rplus-1.1",
    "Linux-OpenIB",
    "MIT",
    "MIT-advertising",
    "MIT-enna",
    "MPL-1.0",
    "MPL-1.1",
    "MPL-2.0",
    "MPL-2.0-no-copyleft-exception",
    "MS-PL",
    "MS-RL",
    "MirOS",
    "Motosoto",
    "Multics",
    "Mup",
    "NASA-1.3",
    "NCSA",
    "NGPL",
    "NOSL",
    "NPL-1.1",
    "NPOSL-3.0",
    "NTP",
    "Naumen",
    "Newsletr",
    "Nokia",
    "Noweb",
    "Nunit",
    "OCCT-PL",
    "OCLC-2.0",
    "ODC-By-1.0",
    "ODC-PDDL-1.0",
    "ODbL-1.0",
    "OFL-1.0",
    "OFL-1.1",
    "OGL-Canada-2.0",
    "OGL-UK-1.0",
    "OGL-UK-2.0",
    "OGL-UK-3.0",
    "OGTSL",
    "OLDAP-1.2",
    "OLDAP-1.3",
    "OLDAP-1.4",
    "OLDAP-2.0",
    "OLDAP-2.0.1",
    "OLDAP-2.1",
    "OLDAP-2.2",
    "OLDAP-2.2.1",
    "OLDAP-2.2.2",
    "OLDAP-2.3",
    "OLDAP-2.4",
    "OLDAP-2.6",
    "OLDAP-2.8",
    "OSET-PL-2.1",
    "OSL-1.0",
    "OSL-1.1",
    "OSL-2.0",
    "OSL-2.1",
    "OSL-3.0",
    "OpenSSL",
    "PHP-3.0",
    "PHP-3.01",
    "Plexus",
    "PostgreSQL",
    "Python-2.0",
    "QPL-1.0",
    "Qhull",
    "RHeCos-1.1",
    "RPL-1.1",
    "RPL-1.5",
    "RPSL-1.0",
    "RSA-MD",
    "RSCPL",
    "Ruby",
    "SAX-PD",
    "SCEA",
    "SGI-B-2.0",
    "SISSL",
    "SMLNJ",
    "SPL-1.0",
    "SWL",
    "Sendmail",
    "Sendmail-8.23",
    "SimPL-2.0",
    "Sleepycat",
    "Spencer-94",
    "Spencer-99",
    "SugarCRM-1.1.3",
    "TCL",
    "TCP-wrappers",
    "TOSL",
    "TU-Berlin-2.0",
    "Unicode-DFS-2015",
    "Unicode-TOU",
    "Unlicense",
    "VSL-1.0",
    "Vim",
    "W3C",
    "W3C-20150513",
    "Watcom-1.0",
    "X11",
    "XFree86-1.1",
    "XSkat",
    "Xerox",
    "Xnet",
    "ZPL-1.1",
    "ZPL-2.0",
    "Zed",
    "Zend-2.0",
    "Zimbra-1.3",
    "Zlib",
    "bsd-license",
    "bzip2-1.0.5",
    "canada-crown",
    "cc-nc",
    "curl",
    "diffmark",
    "dli-model-use",
    "dvipdfm",
    "eCos-2.0",
    "eGenix",
    "eurofound",
    "geo-no-fee-unrestricted",
    "geogratis",
    "gnuplot",
    "hesa-withrights",
    "jabber-osl",
    "libtiff",
    "localauth-withrights",
    "lucent-plan9",
    "met-office-cp",
    "mitre",
    "mpich2",
    "notspecified",
    "other-at",
    "other-closed",
    "other-nc",
    "other-open",
    "other-pd",
    "psfrag",
    "psutils",
    "ukclickusepsi",
    "ukcrown",
    "ukcrown-withrights",
    "ukpsi",
    "user-jsim",
    "wxWindows",
    "xpp",
    "zlib-acknowledgement",
}


@attr.s
class Metadata(object):
    """
    Dataset metadata is used as follows:

    - it is (partly) elicited when creating a new dataset directory ...
    - ... and subsequently written to the directory ...
    - ... where it may be edited ("by hand") ...
    - ... and from where it is read when initializing a `Dataset` object.

    To add custom metadata fields for a dataset,

    - inherit from `Metadata`,
    - add more `attr.ib` s,
    - register the subclass with the dataset by assigning it to
      `cldfbench.Dataset.metadata_cls`.

    Fields flagged with `elicit=True` in their `attr.ib` metadata are the ones
    prompted for by `elicit`.
    """
    # NOTE(review): the `required` flag in the field metadata is not consulted anywhere in
    # this class; presumably it is evaluated by callers - confirm.
    id = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    title = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    description = attr.ib(
        default=None)
    license = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    url = attr.ib(
        default=None,
        metadata=dict(elicit=True))
    citation = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))

    @classmethod
    def elicit(cls) -> 'Metadata':
        """
        Factory method, called when creating a new dataset directory.

        Prompts on stdin (via `input`) for every field whose metadata has `elicit=True`.
        An empty answer is replaced by the field's default, if the field has one.

        :return: A new instance of `cls` populated with the answers.
        """
        kw = {}
        for field in attr.fields(cls):
            if not field.metadata.get('elicit', False):
                continue
            res = input('{0}: '.format(field.name))
            if (not res) and field.default is not attr.NOTHING:
                res = field.default
            kw[field.name] = res
        return cls(**kw)

    @classmethod
    def from_file(cls, fname: pathlib.Path) -> 'Metadata':
        """
        Factory method, called when instantiating a `Dataset` object.

        :param fname: Path of a UTF-8 encoded JSON file; the top-level keys of the JSON \
        object are passed as keyword arguments to the class.
        :return: A new instance of `cls`.
        :raises ValueError: If the file content is not valid JSON.
        """
        with fname.open('r', encoding='utf-8') as fp:
            try:
                return cls(**json.load(fp))
            except json.decoder.JSONDecodeError as e:  # pragma: no cover
                # Chain explicitly, so the traceback marks the parser error as the cause.
                raise ValueError(
                    'Invalid JSON file: {}\n{}'.format(fname.resolve(), e)) from e

    def write(self, fname: pathlib.Path) -> None:
        """
        Serialize all fields (as given by `attr.asdict`) as indented JSON to a file.

        :param fname: Path of the file to write (UTF-8 encoded, overwritten if it exists).
        """
        with fname.open('w', encoding='utf-8') as fp:
            json.dump(attr.asdict(self), fp, indent=4)

    @property
    def known_license(self) -> typing.Optional[licenses.License]:
        """
        The result of looking up `self.license` with `clldutils.licenses.find`, or `None`
        if no license is set.
        """
        if self.license:
            return licenses.find(self.license)
        return None

    @property
    def zenodo_license(self) -> typing.Optional[str]:
        """
        The id of the known license if it is listed in `LICENSES`, `None` otherwise.
        """
        known = self.known_license  # Look up only once.
        if known and known.id in LICENSES:
            return known.id
        return None

    def common_props(self) -> typing.Dict[str, object]:
        """
        The metadata as JSON-LD object suitable for inclusion in CLDF metadata.

        Only fields with a truthy value are included. For `dc:license`, the URL of the
        known license is preferred; the raw `license` value is used as fallback.
        """
        res = collections.OrderedDict()
        if self.title:
            res["dc:title"] = self.title
        if self.description:
            res["dc:description"] = self.description
        if self.citation:
            res["dc:bibliographicCitation"] = self.citation
        if self.url:
            res["dc:identifier"] = self.url
        known = self.known_license
        if known:
            res['dc:license'] = known.url
        elif self.license:
            res['dc:license'] = self.license
        return res

    def markdown(self) -> str:
        """
        Render the metadata as Markdown text with a title, a "How to cite" section and a
        "Description" section.

        :return: The Markdown text, lines joined with newlines.
        """
        lines = [
            '# {0}\n'.format(self.title or 'Dataset {0}'.format(self.id)),
            '## How to cite\n\nIf you use these data please cite',
        ]
        if self.citation:
            lines.append('- the original source')
            # Each line of the citation is rendered as a quoted line.
            lines.extend([" > {}".format(line) for line in self.citation.split('\n')])
            lines.append(
                "- the derived dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            )
        else:  # pragma: no cover
            lines.append(
                "this dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            )
        lines.append('\n## Description\n\n')
        if self.description:
            lines.append('{0}\n'.format(self.description))
        if self.license:
            lines.append('This dataset is licensed under a %s license\n' % self.license)
        if self.url:
            lines.append('Available online at %s\n' % self.url)
        return '\n'.join(lines)
def get_creators_and_contributors(text, strict=True) -> typing.Tuple[list, list]:
    """
    Split the people listed in the first markdown table of `text` into creators and
    contributors.

    Column names are lower-cased. The `role` column is read as comma-separated list of
    roles, matched case-insensitively. The roles `author`, `creator` and `maintainer` put
    a person in the creators list; any other role puts them in the contributors list, with
    the matching name from `CONTRIBUTOR_TYPES` stored under the key `type`.

    :param text: Markdown text, e.g. the content of CONTRIBUTORS.md.
    :param strict: If `True`, a role that is neither a creator role nor listed in \
    `CONTRIBUTOR_TYPES` raises `KeyError`; otherwise such a role is mapped to `'Other'`.
    :return: Pair `(creators, contributors)` of lists of `dict`s (the table row without \
    the `role` column), without duplicates; both are empty if `text` contains no table.
    """
    creator_roles = {'author', 'creator', 'maintainer'}
    canonical_types = {name.lower(): name for name in CONTRIBUTOR_TYPES}
    creators = []
    contributors = []

    # Only the first table is considered.
    try:
        header, rows = next(iter_markdown_tables(text))
    except StopIteration:  # pragma: no cover
        return creators, contributors

    for cells in rows:
        record = {key.lower(): value for key, value in zip(header, cells)}
        person = {key: value for key, value in record.items() if key != 'role'}
        raw_roles = record.get('role', '').split(',')
        for role in nfilter([part.strip().lower() for part in raw_roles]):
            # Fresh copy per role, because contributor entries get a role-specific 'type'.
            entry = dict(person)
            if role in creator_roles:
                if entry not in creators:
                    creators.append(entry)
                continue
            entry['type'] = (
                canonical_types[role] if strict else canonical_types.get(role, 'Other'))
            if entry not in contributors:
                contributors.append(entry)
    return creators, contributors