Source code for mardi_importer.zbmath.misc

from habanero import Crossref
from requests.exceptions import HTTPError


def get_tag(tag_name, namespace):
    """
    Build a namespace-qualified tag name of the form ``{namespace}tag_name``.

    Args:
        tag_name (string): local name of the tag, e.g. author
        namespace (string): URL of the namespace the tag belongs to

    Returns:
        string: the namespace wrapped in curly braces, followed by the tag name
    """
    # Doubled braces are literal braces; the inner pair interpolates the value.
    return f"{{{namespace}}}{tag_name}"
def _join_author_names(work_info):
    """Return all authors as ``family, given`` joined by ";", or None."""
    # looks like: 'author': [{'given': 'Max', 'family': 'Mustermann',
    #                         'sequence': 'first', 'affiliation': []}]
    if "author" not in work_info:
        return None
    author_list = []
    for author_dict in work_info["author"]:
        # Without a family name there is too little information; the whole
        # author list is rejected in that case.
        family_name = author_dict.get("family")
        if not family_name:
            return None
        # Looked up per author so that a missing given name is never filled
        # in with the given name of the previous author.
        first_name = author_dict.get("given")
        if first_name:
            author_list.append(family_name + ", " + first_name)
        else:
            # first name not necessarily needed
            author_list.append(family_name)
    return ";".join(author_list)


def _first_published_year(work_info):
    """Return the first entry of the first ``date-parts`` list, or None."""
    # looks like: 'published': {'date-parts': [[2008]]}
    # This is not necessarily the year of publication in the journal.
    try:
        return work_info["published"]["date-parts"][0][0]
    except (KeyError, IndexError, TypeError):
        # "published" or "date-parts" missing, or the date list is empty/None
        return None


def _join_reference_journals(work_info):
    """Return the distinct journal titles of all references, or None."""
    if "reference" not in work_info:
        return None
    serials = [
        serial_dict["journal-title"]
        for serial_dict in work_info["reference"]
        if "journal-title" in serial_dict
    ]
    # if no serials were found
    if not serials:
        return None
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is deterministic (a set would not be).
    return ";".join(dict.fromkeys(serials))


def parse_doi_info(val, work_info):
    """
    Extract the information for a specific tag from a doi query response.

    Information about the response fields can be found under
    https://api.crossref.org/swagger-ui/index.html#/Works/get_works

    Args:
        val (string): tag to extract; one of author, document_title,
            publication_year, serial, language, keywords
        work_info (dict): information from doi query response

    Returns:
        information for the specific tag (multiple values are joined with
        ";"; publication_year is returned as found in the response),
        None if not found or if ``val`` is not a known tag
    """
    if val == "author":
        return _join_author_names(work_info)
    if val == "document_title":
        # The response field is called "title" and holds a list of strings.
        title_list = work_info.get("title")
        if not title_list:
            return None
        return ";".join(title_list)
    if val == "publication_year":
        return _first_published_year(work_info)
    if val == "serial":
        return _join_reference_journals(work_info)
    if val == "language":
        return work_info.get("language")
    if val == "keywords":
        if "subject" not in work_info:
            return None
        return ";".join(work_info["subject"])
    # unknown tag
    return None
def get_info_from_doi(doi, key):
    """
    Query the crossref API for information about a DOI.

    Args:
        doi (string): a single doi, or several dois separated by ";"; they
            are queried in the given order
        key (string): requested information; "document_title" and "journal"
            are handled, any other value yields None

    Returns:
        string: for "document_title", the titles of the first doi that has
        any, joined with ";" and with newlines and tabs replaced by spaces;
        for "journal", the first container title of the first doi that could
        be queried. None if nothing was found, if the joint title is longer
        than 500 characters, or if the queried doi has no container title.
    """
    crossref_client = Crossref(mailto="pusch@zib.de")
    for single_doi in doi.split(";"):
        try:
            response = crossref_client.works(ids=single_doi)
        except HTTPError:
            # if the doi is not found, there is a 404
            print("HTTP Error!")
            continue
        if not response:
            continue

        if key == "document_title":
            titles = response["message"].get("title")
            if not titles:
                # no usable title for this doi, try the next one
                continue
            title = ";".join(titles).strip()
            for whitespace in ("\n", "\t"):
                title = title.replace(whitespace, " ").strip()
            # overly long titles are discarded entirely
            return None if len(title) > 500 else title

        if key == "journal":
            containers = response["message"].get("container-title")
            # a missing or empty container title ends the search
            if not containers:
                return None
            return containers[0].strip()
    return None