bookwyrm-mastodon/bookwyrm/utils/isni.py

"""ISNI author checking utilities"""
import xml.etree.ElementTree as ET
from urllib.parse import quote_plus

import requests

# Pieces of the search URL for the ISNI service hosted at isni.oclc.org.
# A request URL is built as BASE_STRING + <url-encoded name> + SUFFIX_STRING.
# BASE_STRING ends with `%22` (a url-encoded double quote), which opens the
# quoted search term for the `pica.na` index.
BASE_STRING = "http://isni.oclc.org/sru/?query=pica.na+%3D+%22"
# SUFFIX_STRING starts with the closing `%22` and then appends the fixed
# request parameters (recordSchema=isni-b, maximumRecords=10, sort keys, ...).
#pylint: disable=line-too-long
SUFFIX_STRING = "%22&version=1.1&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10&startRecord=1&recordPacking=xml&sortKeys=RLV%2Cpica%2C0%2C%2C"


def url_stringify(string):
    """Encode a string for use as a URL query value.

    Spaces become ``+`` and every other character that is not safe inside
    a query string (``&``, ``#``, ``%``, ``"``, non-ASCII text, ...) is
    percent-encoded as UTF-8, so the value cannot corrupt the query it is
    inserted into. Plain names made of letters and spaces are encoded
    exactly as before.
    """
    return quote_plus(string)


def find_authors_by_name(names):
    """Query the ISNI database for authors matching a name.

    Args:
        names: the author name to search for; it is passed through
            ``url_stringify`` before being placed in the query URL.

    Returns:
        A list of dicts, one per matching person record. Each dict has a
        "uri" and a "name" key, plus a "description" key when the record
        contains a ``nameTitle`` element. Records that have no forename
        (i.e. are not persons), no ISNI URI, or no usable name text are
        skipped.

    Raises:
        requests.RequestException: if the request fails or times out.
        xml.etree.ElementTree.ParseError: if the response is not valid XML.
    """
    query = BASE_STRING + url_stringify(names) + SUFFIX_STRING
    # never wait indefinitely on the remote server
    result = requests.get(query, timeout=15)
    # the OCLC ISNI server asserts the payload is encoded
    # in latin1, but we know better
    result.encoding = "utf-8"
    root = ET.fromstring(result.text)

    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # the parent of a forename element holds the personal name;
        # records without one are not persons
        personal_name = element.find(".//forename/..")
        # compare against None explicitly: the truth value of an Element
        # reflects whether it has children, not whether it was found
        if personal_name is None:
            continue

        uri = element.findtext(".//isniURI")
        if not uri:
            continue

        # NOTE: this will often be incorrect, many naming systems
        # list "surname" before personal name
        name_parts = (
            personal_name.findtext(".//forename"),
            personal_name.findtext(".//surname"),
        )
        # a record may lack a surname or carry empty name elements;
        # join whatever text is actually present
        name = " ".join(part for part in name_parts if part)
        if not name:
            continue

        author = {"uri": uri, "name": name}
        description = element.find(".//nameTitle")
        if description is not None:
            author["description"] = description.text

        possible_authors.append(author)

    return possible_authors
code formatting 2021-10-29 06:14:32 -04:00			`"""ISNI author checking utilities"""`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`import xml.etree.ElementTree as ET`
code formatting 2021-10-29 06:14:32 -04:00			`import requests`
isni author lookup utility 2021-10-29 01:12:31 -04:00
			`# get data`
code formatting 2021-10-29 06:14:32 -04:00			`BASE_STRING = "http://isni.oclc.org/sru/?query=pica.na+%3D+%22"`
improve isni search logic - skip ISNIs that aren't persons - don't look for text values in non-existent nameTitle elements 2021-10-29 18:24:42 -04:00			`#pylint: disable=line-too-long`
code formatting 2021-10-29 06:14:32 -04:00			`SUFFIX_STRING = "%22&version=1.1&operation=searchRetrieve&recordSchema=isni-b&maximumRecords=10&startRecord=1&recordPacking=xml&sortKeys=RLV%2Cpica%2C0%2C%2C"`
isni author lookup utility 2021-10-29 01:12:31 -04:00

			`def url_stringify(string):`
code formatting 2021-10-29 06:14:32 -04:00			`"""replace spaces for url encoding"""`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`return string.replace(" ", "+")`


			`def find_authors_by_name(names):`
code formatting 2021-10-29 06:14:32 -04:00			`"""Query the ISNI database for an author"""`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`names = url_stringify(names)`
code formatting 2021-10-29 06:14:32 -04:00			`query = BASE_STRING + names + SUFFIX_STRING`
			`result = requests.get(query)`
fix encoding The OCLC server claims that the xml payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly encode things as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8 With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html 2021-10-29 06:00:35 -04:00			`# the OCLC ISNI server asserts the payload is encoded`
			`# in latin1, but we know better`
code formatting 2021-10-29 06:14:32 -04:00			`result.encoding = "utf-8"`
			`payload = result.text`
fix encoding The OCLC server claims that the xml payload is encoded as latin1 (ISO-8859-1). This causes Requests to incorrectly encode things as latin1, when actually everything is (thank goodness) UTF-8. We can fix it by just telling Requests that it is really UTF-8 With thanks to Tex Texin, creator of http://i18nqa.com/debug/utf8-debug.html 2021-10-29 06:00:35 -04:00			`# parse xml`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`root = ET.fromstring(payload)`

			`# build list of possible authors`
			`possible_authors = []`
code formatting 2021-10-29 06:14:32 -04:00			`for element in root.iter("responseRecord"):`
isni author lookup utility 2021-10-29 01:12:31 -04:00
code formatting 2021-10-29 06:14:32 -04:00			`author = {}`
			`author["uri"] = element.find(".//isniURI").text`
improve isni search logic - skip ISNIs that aren't persons - don't look for text values in non-existent nameTitle elements 2021-10-29 18:24:42 -04:00			`# NOTE: this will often be incorrect, many naming systems`
			`# list "surname" before personal name`
code formatting 2021-10-29 06:14:32 -04:00			`personal_name = element.find(".//forename/..")`
improve isni search logic - skip ISNIs that aren't persons - don't look for text values in non-existent nameTitle elements 2021-10-29 18:24:42 -04:00			`description = element.find(".//nameTitle")`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`if personal_name:`
improve isni search logic - skip ISNIs that aren't persons - don't look for text values in non-existent nameTitle elements 2021-10-29 18:24:42 -04:00			`forename = personal_name.find(".//forename")`
			`surname = personal_name.find(".//surname")`
isni author lookup utility 2021-10-29 01:12:31 -04:00			`author["name"] = forename.text + " " + surname.text`
improve isni search logic - skip ISNIs that aren't persons - don't look for text values in non-existent nameTitle elements 2021-10-29 18:24:42 -04:00			`if description is not None:`
			`author["description"] = description.text`
isni author lookup utility 2021-10-29 01:12:31 -04:00
			`possible_authors.append(author)`

			`return possible_authors`