Merge pull request #1581 from hughrun/isni-poc

Query ISNI database when adding authors
This commit is contained in:
Mouse Reeve 2021-11-23 14:30:49 -08:00 committed by GitHub
commit a24fb5cd11
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 305 additions and 16 deletions

View File

@ -27,7 +27,7 @@ class Author(BookDataModel):
# idk probably other keys would be useful here? # idk probably other keys would be useful here?
born = fields.DateTimeField(blank=True, null=True) born = fields.DateTimeField(blank=True, null=True)
died = fields.DateTimeField(blank=True, null=True) died = fields.DateTimeField(blank=True, null=True)
name = fields.CharField(max_length=255, deduplication_field=True) name = fields.CharField(max_length=255)
aliases = fields.ArrayField( aliases = fields.ArrayField(
models.CharField(max_length=255), blank=True, default=list models.CharField(max_length=255), blank=True, default=list
) )

View File

@ -2,6 +2,7 @@
{% load i18n %} {% load i18n %}
{% load markdown %} {% load markdown %}
{% load humanize %} {% load humanize %}
{% load utilities %}
{% block title %}{{ author.name }}{% endblock %} {% block title %}{{ author.name }}{% endblock %}
@ -25,7 +26,7 @@
<div class="block columns content" itemscope itemtype="https://schema.org/Person"> <div class="block columns content" itemscope itemtype="https://schema.org/Person">
<meta itemprop="name" content="{{ author.name }}"> <meta itemprop="name" content="{{ author.name }}">
{% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id %} {% if author.aliases or author.born or author.died or author.wikipedia_link or author.openlibrary_key or author.inventaire_id or author.isni %}
<div class="column is-two-fifths"> <div class="column is-two-fifths">
<div class="box py-2"> <div class="box py-2">
<dl> <dl>
@ -63,6 +64,14 @@
</p> </p>
{% endif %} {% endif %}
{% if author.isni %}
<p class="my-1">
<a itemprop="sameAs" href="https://isni.org/isni/{{ author.isni|remove_spaces }}" rel="noopener" target="_blank">
{% trans "View ISNI record" %}
</a>
</p>
{% endif %}
{% if author.openlibrary_key %} {% if author.openlibrary_key %}
<p class="my-1"> <p class="my-1">
<a itemprop="sameAs" href="https://openlibrary.org/authors/{{ author.openlibrary_key }}" target="_blank" rel="noopener"> <a itemprop="sameAs" href="https://openlibrary.org/authors/{{ author.openlibrary_key }}" target="_blank" rel="noopener">

View File

@ -1,6 +1,7 @@
{% extends 'layout.html' %} {% extends 'layout.html' %}
{% load i18n %} {% load i18n %}
{% load humanize %} {% load humanize %}
{% load utilities %}
{% block title %}{% if book %}{% blocktrans with book_title=book.title %}Edit "{{ book_title }}"{% endblocktrans %}{% else %}{% trans "Add Book" %}{% endif %}{% endblock %} {% block title %}{% if book %}{% blocktrans with book_title=book.title %}Edit "{{ book_title }}"{% endblocktrans %}{% else %}{% trans "Add Book" %}{% endif %}{% endblock %}
@ -52,19 +53,29 @@
{% for author in author_matches %} {% for author in author_matches %}
<fieldset> <fieldset>
<legend class="title is-5 mb-1"> <legend class="title is-5 mb-1">
{% blocktrans with name=author.name %}Is "{{ name }}" an existing author?{% endblocktrans %} {% blocktrans with name=author.name %}Is "{{ name }}" one of these authors?{% endblocktrans %}
</legend> </legend>
{% with forloop.counter0 as counter %} {% with forloop.counter0 as counter %}
{% for match in author.matches %} {% for match in author.matches %}
<label class="label mb-2"> <label class="label">
<input type="radio" name="author_match-{{ counter }}" value="{{ match.id }}" required> <input type="radio" name="author_match-{{ counter }}" value="{{ match.id }}" required>
{{ match.name }} {{ match.name }}
</label> </label>
<p class="help"> <p class="help ml-5 mb-2">
<a href="{{ match.local_path }}" target="_blank">{% blocktrans with book_title=match.book_set.first.title %}Author of <em>{{ book_title }}</em>{% endblocktrans %}</a> {% with book_title=match.book_set.first.title alt_title=match.bio %}
{% if book_title %}
<a href="{{ match.local_path }}" target="_blank">{% trans "Author of " %}<em>{{ book_title }}</em></a>
{% else %}
<a href="{{ match.id }}" target="_blank">{% if alt_title %}{% trans "Author of " %}<em>{{ alt_title }}</em>{% else %} {% trans "Find more information at isni.org" %}{% endif %}</a>
{% endif %}
{% endwith %}
</p> </p>
<p class="help ml-5">
{{ author.existing_isnis|get_isni_bio:match }}
</p>
{{ author.existing_isnis|get_isni:match }}
{% endfor %} {% endfor %}
<label class="label"> <label class="label mt-2">
<input type="radio" name="author_match-{{ counter }}" value="{{ author.name }}" required> {% trans "This is a new author" %} <input type="radio" name="author_match-{{ counter }}" value="{{ author.name }}" required> {% trans "This is a new author" %}
</label> </label>
{% endwith %} {% endwith %}

View File

@ -1,8 +1,11 @@
""" template filters for really common utilities """ """ template filters for really common utilities """
import os import os
import re
from uuid import uuid4 from uuid import uuid4
from django import template from django import template
from django.utils.safestring import mark_safe
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from django.template.defaultfilters import stringfilter
from django.templatetags.static import static from django.templatetags.static import static
@ -66,3 +69,39 @@ def get_book_cover_thumbnail(book, size="medium", ext="jpg"):
return cover_thumbnail.url return cover_thumbnail.url
except OSError: except OSError:
return static("images/no_cover.jpg") return static("images/no_cover.jpg")
@register.filter(name="get_isni_bio")
def get_isni_bio(existing, author):
    """Return an "Author of <title>" snippet for an author matched by ISNI.

    ``existing`` is the collection of records to search and ``author`` is the
    author being displayed. The two are matched on the digits of their
    ``isni`` attributes, so spacing and other formatting are ignored.

    Returns safe HTML with the matching record's ``bio`` wrapped in ``<em>``,
    or an empty string when ``existing`` is empty or nothing matches.
    """
    # imported here so the filter carries its own dependency
    # pylint: disable=import-outside-toplevel
    from django.utils.html import format_html

    if not existing:
        return ""

    auth_isni = re.sub(r"\D", "", str(author.isni))
    for value in existing:
        if hasattr(value, "bio") and auth_isni == re.sub(r"\D", "", str(value.isni)):
            # the bio is text from an external record, so it must be escaped
            # before it is inserted into markup; format_html escapes the
            # argument and marks only the finished string as safe
            return format_html("Author of <em>{}</em>", value.bio)

    return ""
# pylint: disable=unused-argument
@register.filter(name="get_isni", needs_autoescape=True)
def get_isni(existing, author, autoescape=True):
    """Return a hidden form input carrying the ISNI matched to an author.

    ``existing`` is the collection of records to search and ``author`` is the
    author being displayed. The two are matched on the digits of their
    ``isni`` attributes, so spacing and other formatting are ignored.

    Returns safe HTML for an ``<input>`` named ``isni-for-<author.id>`` whose
    value is the matching record's ISNI, or an empty string when ``existing``
    is empty or nothing matches. ``autoescape`` is accepted because the filter
    is registered with ``needs_autoescape``; values are always escaped.
    """
    # imported here so the filter carries its own dependency
    # pylint: disable=import-outside-toplevel
    from django.utils.html import format_html

    if not existing:
        return ""

    auth_isni = re.sub(r"\D", "", str(author.isni))
    for value in existing:
        if hasattr(value, "isni") and auth_isni == re.sub(r"\D", "", str(value.isni)):
            # both values end up inside HTML attributes, so they are passed
            # as format_html arguments to be escaped rather than interpolated
            return format_html(
                '<input type="text" name="isni-for-{}" value="{}" hidden>',
                author.id,
                value.isni,
            )
    return ""
@register.filter(name="remove_spaces")
@stringfilter
def remove_spaces(arg):
    """Strip every whitespace character out of the value passed in"""
    as_text = str(arg)
    # \s covers tabs and newlines as well as plain spaces
    return re.sub(r"\s", "", as_text)

183
bookwyrm/utils/isni.py Normal file
View File

@ -0,0 +1,183 @@
"""ISNI author checking utilities"""
import xml.etree.ElementTree as ET
import requests
from bookwyrm import activitypub, models
def request_isni_data(search_index, search_term, max_records=5):
    """Send a searchRetrieve query to the ISNI SRU endpoint.

    The query is ``<search_index>="<search_term>"`` and at most
    ``max_records`` records are requested. Returns the response body as text.
    """
    response = requests.get(
        "http://isni.oclc.org/sru/",
        params={
            "query": f'{search_index}="{search_term}"',
            "version": "1.1",
            "operation": "searchRetrieve",
            "recordSchema": "isni-b",
            "maximumRecords": max_records,
            "startRecord": "1",
            "recordPacking": "xml",
            "sortKeys": "RLV,pica,0,,",
        },
        timeout=10,
    )
    # the OCLC ISNI server claims the payload is latin1, but it is not,
    # so force utf-8 before the body is decoded
    response.encoding = "utf-8"
    return response.text
def make_name_string(element):
    """Create a string of the form 'personal_name surname'.

    ``element`` is an XML element that contains ``forename`` and/or
    ``surname`` descendants. Whichever parts are present and non-empty are
    joined with a single space; an empty string is returned when neither
    part has any text.
    """

    # NOTE: this will often be incorrect, many naming systems
    # list "surname" before personal name
    forename = element.find(".//forename")
    surname = element.find(".//surname")

    # either tag may be missing, and a tag that is present but empty has
    # ``text`` set to None, so normalise both cases to an empty string
    forename_text = (forename.text or "") if forename is not None else ""
    surname_text = (surname.text or "") if surname is not None else ""

    return " ".join(part for part in (forename_text, surname_text) if part)
def get_other_identifier(element, code):
    """Get another identifier associated with an author from their ISNI record.

    ``code`` names the identifier scheme (for example ``"viaf"``) and is
    compared case-insensitively in both places that are searched. Returns
    the identifier text, or an empty string when the record has none.
    """
    wanted = code.lower()

    for section_head in element.findall(".//otherIdentifierOfIdentity"):
        id_type = section_head.find(".//type")
        identifier = section_head.find(".//identifier")
        if (
            id_type is not None
            and identifier is not None
            # an empty tag has text None, which can never match
            and (id_type.text or "").lower() == wanted
            and identifier.text
        ):
            return identifier.text

    # if we can't find it in otherIdentifierOfIdentity,
    # try sources
    for source in element.findall(".//sources"):
        code_of_source = source.find(".//codeOfSource")
        source_identifier = source.find(".//sourceIdentifier")
        if (
            code_of_source is not None
            and source_identifier is not None
            and (code_of_source.text or "").lower() == wanted
            and source_identifier.text
        ):
            return source_identifier.text

    return ""
def get_external_information_uri(element, match_string):
    """Get a URL associated with an author from their ISNI record.

    Looks through the record's ``externalInformation`` entries for one whose
    ``information`` text equals ``match_string`` (case-insensitively) and
    returns that entry's ``URI`` text. Returns an empty string when no entry
    matches or the matching entries have no URI text.
    """
    wanted = match_string.lower()

    for source in element.findall(".//externalInformation"):
        information = source.find(".//information")
        uri = source.find(".//URI")
        if (
            uri is not None
            and information is not None
            # an empty tag has text None, which can never match
            and (information.text or "").lower() == wanted
            and uri.text
        ):
            return uri.text
    return ""
def find_authors_by_name(name_string, description=False):
    """Query the ISNI database for possible author matches by name.

    Every ``responseRecord`` in the search result that has a personal name
    and an ISNI is turned into an author via ``get_author_from_isni``. When
    ``description`` is truthy, each author's ``bio`` is replaced with the
    title of one of their works, or ``None`` if no usable title is listed.

    Returns the list of author objects, in the order the records appear.
    """

    payload = request_isni_data("pica.na", name_string)
    # parse xml
    root = ET.fromstring(payload)
    # build list of possible authors
    possible_authors = []
    for element in root.iter("responseRecord"):
        # only records with a forename are treated as people. Compare with
        # None explicitly: an Element's truth value reflects whether it has
        # children, not whether the lookup found anything
        if element.find(".//forename/..") is None:
            continue

        isni = element.find(".//isniUnformatted")
        if isni is None or not isni.text:
            # nothing to look the author up by
            continue

        author = get_author_from_isni(isni.text)

        if description:
            titles = []
            # prefer title records from LoC+ coop, Australia, Ireland, or Singapore
            # in that order
            for source in ["LCNACO", "NLA", "N6I", "NLB"]:
                for parent in element.findall(f'.//titleOfWork/[@source="{source}"]'):
                    titles.append(parent.find(".//title"))
                for parent in element.findall(f'.//titleOfWork[@subsource="{source}"]'):
                    titles.append(parent.find(".//title"))
            # otherwise just grab the first title listing
            titles.append(element.find(".//title"))

            # find() yields None when a record has no title, so drop missing
            # and empty entries before touching the text.
            # '@' is used by ISNI/OCLC to index the starting point ignoring stop words
            # (e.g. "The @Government of no one")
            cleaned = (
                title.text.replace("@", "")
                for title in titles
                if title is not None and title.text
            )
            # some of the "titles" in ISNI are a little ...iffy,
            # so purely numeric ones are passed over
            author.bio = next(
                (title for title in cleaned if not title.isnumeric()), None
            )

        possible_authors.append(author)

    return possible_authors
def get_author_from_isni(isni):
    """Look up one ISNI and build an author data object from the record"""

    root = ET.fromstring(request_isni_data("pica.isn", isni))
    # a single responseRecord is expected; the first one is used regardless
    record = root.find(".//responseRecord")

    name = make_name_string(record.find(".//forename/.."))
    viaf = get_other_identifier(record, "viaf")
    # the set drops the duplicate variants found in ISNI records, and it is
    # turned back into a list because aliases needs to be a list
    aliases = list(
        {
            make_name_string(variant)
            for variant in record.findall(".//personalNameVariant")
        }
    )
    name_title = record.find(".//nameTitle")
    bio = "" if name_title is None else name_title.text
    wikipedia = get_external_information_uri(record, "Wikipedia")

    return activitypub.Author(
        id=record.find(".//isniURI").text,
        name=name,
        isni=isni,
        viafId=viaf,
        aliases=aliases,
        bio=bio,
        wikipediaLink=wikipedia,
    )
def build_author_from_isni(match_value):
    """Return ``{"author": ...}`` for an ISNI URL, or ``{}`` for anything else"""
    isni_prefix = "https://isni.org/isni/"

    if not match_value.startswith(isni_prefix):
        # not an ISNI URL, so it is a plain name string
        return {}

    # what is left once the URL prefix is removed is the ISNI itself
    return {"author": get_author_from_isni(match_value.replace(isni_prefix, ""))}
def augment_author_metadata(author, isni):
    """Fill in an author's missing fields from their ISNI record and save"""

    isni_author = get_author_from_isni(isni)
    isni_author.to_model(model=models.Author, instance=author, overwrite=False)

    # aliases are the one field that IS replaced: the ISNI aliases are merged
    # with the author's current ones, and a set is used for the merge because
    # ISNI records often repeat the same alias
    merged = set(isni_author.aliases)
    merged.update(author.aliases)
    author.aliases = list(merged)
    author.save()

View File

@ -1,4 +1,5 @@
""" the good stuff! the books! """ """ the good stuff! the books! """
from re import sub
from dateutil.parser import parse as dateparse from dateutil.parser import parse as dateparse
from django.contrib.auth.decorators import login_required, permission_required from django.contrib.auth.decorators import login_required, permission_required
from django.contrib.postgres.search import SearchRank, SearchVector from django.contrib.postgres.search import SearchRank, SearchVector
@ -11,10 +12,16 @@ from django.utils.decorators import method_decorator
from django.views import View from django.views import View
from bookwyrm import book_search, forms, models from bookwyrm import book_search, forms, models
# from bookwyrm.activitypub.base_activity import ActivityObject
from bookwyrm.utils.isni import (
find_authors_by_name,
build_author_from_isni,
augment_author_metadata,
)
from bookwyrm.views.helpers import get_edition from bookwyrm.views.helpers import get_edition
from .books import set_cover_from_url from .books import set_cover_from_url
# pylint: disable=no-self-use # pylint: disable=no-self-use
@method_decorator(login_required, name="dispatch") @method_decorator(login_required, name="dispatch")
@method_decorator( @method_decorator(
@ -33,6 +40,7 @@ class EditBook(View):
data = {"book": book, "form": forms.EditionForm(instance=book)} data = {"book": book, "form": forms.EditionForm(instance=book)}
return TemplateResponse(request, "book/edit/edit_book.html", data) return TemplateResponse(request, "book/edit/edit_book.html", data)
# pylint: disable=too-many-locals
def post(self, request, book_id=None): def post(self, request, book_id=None):
"""edit a book cool""" """edit a book cool"""
# returns None if no match is found # returns None if no match is found
@ -48,6 +56,7 @@ class EditBook(View):
if add_author: if add_author:
data["add_author"] = add_author data["add_author"] = add_author
data["author_matches"] = [] data["author_matches"] = []
data["isni_matches"] = []
for author in add_author.split(","): for author in add_author.split(","):
if not author: if not author:
continue continue
@ -56,15 +65,35 @@ class EditBook(View):
"aliases", weight="B" "aliases", weight="B"
) )
data["author_matches"].append( author_matches = (
{
"name": author.strip(),
"matches": (
models.Author.objects.annotate(search=vector) models.Author.objects.annotate(search=vector)
.annotate(rank=SearchRank(vector, author)) .annotate(rank=SearchRank(vector, author))
.filter(rank__gt=0.4) .filter(rank__gt=0.4)
.order_by("-rank")[:5] .order_by("-rank")[:5]
), )
isni_authors = find_authors_by_name(
author, description=True
) # find matches from ISNI API
# dedupe isni authors we already have in the DB
exists = [
i
for i in isni_authors
for a in author_matches
if sub(r"\D", "", str(i.isni)) == sub(r"\D", "", str(a.isni))
]
# pylint: disable=cell-var-from-loop
matches = list(filter(lambda x: x not in exists, isni_authors))
# combine existing and isni authors
matches.extend(author_matches)
data["author_matches"].append(
{
"name": author.strip(),
"matches": matches,
"existing_isnis": exists,
} }
) )
@ -122,6 +151,8 @@ class EditBook(View):
class ConfirmEditBook(View): class ConfirmEditBook(View):
"""confirm edits to a book""" """confirm edits to a book"""
# pylint: disable=too-many-locals
# pylint: disable=too-many-branches
def post(self, request, book_id=None): def post(self, request, book_id=None):
"""edit a book cool""" """edit a book cool"""
# returns None if no match is found # returns None if no match is found
@ -147,8 +178,24 @@ class ConfirmEditBook(View):
author = get_object_or_404( author = get_object_or_404(
models.Author, id=request.POST[f"author_match-{i}"] models.Author, id=request.POST[f"author_match-{i}"]
) )
# update author metadata if the ISNI record is more complete
isni = request.POST.get(f"isni-for-{match}", None)
if isni is not None:
augment_author_metadata(author, isni)
except ValueError: except ValueError:
# otherwise it's a name # otherwise it's a new author
isni_match = request.POST.get(f"author_match-{i}")
author_object = build_author_from_isni(isni_match)
# with author data class from isni id
if "author" in author_object:
skeleton = models.Author.objects.create(
name=author_object["author"].name
)
author = author_object["author"].to_model(
model=models.Author, overwrite=True, instance=skeleton
)
else:
# or it's just a name
author = models.Author.objects.create(name=match) author = models.Author.objects.create(name=match)
book.authors.add(author) book.authors.add(author)