Refactors book ingest from openlibrary

it's very repetitive now but also works way better so who can say
This commit is contained in:
Mouse Reeve 2020-04-05 17:00:01 -07:00
parent 94efe860b0
commit 26d9454ec3
2 changed files with 132 additions and 95 deletions

View File

@ -1,6 +1,6 @@
''' openlibrary data connector ''' ''' openlibrary data connector '''
from django.core.exceptions import ObjectDoesNotExist
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.db import transaction
import re import re
import requests import requests
@ -13,6 +13,18 @@ from .openlibrary_languages import languages
class Connector(AbstractConnector): class Connector(AbstractConnector):
''' instantiate a connector for OL ''' ''' instantiate a connector for OL '''
def __init__(self, identifier): def __init__(self, identifier):
get_first = lambda a: a[0]
self.book_mappings = {
'publish_date': ('published_date', get_date),
'first_publish_date': ('first_published_date', get_date),
'description': ('description', get_description),
'isbn_13': ('isbn', get_first),
'oclc_numbers': ('oclc_number', get_first),
'lccn': ('lccn', get_first),
'languages': ('languages', get_languages),
'number_of_pages': ('pages', None),
'series': ('series', get_first),
}
super().__init__(identifier) super().__init__(identifier)
@ -52,120 +64,125 @@ class Connector(AbstractConnector):
openlibrary_key=olkey openlibrary_key=olkey
).first() ).first()
if book: if book:
if isinstance(book, models.Work):
return book.default_edition
return book return book
# no book was found, so we start creating a new one # no book was found, so we start creating a new one
model = models.Edition
if re.match(r'^OL\d+W$', olkey): if re.match(r'^OL\d+W$', olkey):
model = models.Work with transaction.atomic():
book = model(openlibrary_key=olkey) # create both work and a default edition
return self.update_book(book) work_data = self.load_book_data(olkey)
work = self.create_book(olkey, work_data, models.Work)
edition_options = self.load_edition_data(olkey).get('entries')
edition_data = pick_default_edition(edition_options)
key = edition_data.get('key').split('/')[-1]
edition = self.create_book(key, edition_data, models.Edition)
edition.parent_work = work
edition.save()
else:
with transaction.atomic():
edition_data = self.load_book_data(olkey)
edition = self.create_book(olkey, edition_data, models.Edition)
work_key = edition_data.get('works')[0]['key'].split('/')[-1]
work = models.Work.objects.filter(
openlibrary_key=work_key
).first()
if not work:
work_data = self.load_book_data(work_key)
work = self.create_book(work_key, work_data, models.Work)
edition.parent_work = work
edition.save()
if not edition.authors and work.authors:
edition.authors.set(work.authors.all())
return edition
def create_book(self, key, data, model):
    ''' build and persist a new work or edition, then fill it from data

    key: the openlibrary key to store on the new record
    data: the openlibrary json blob for the book; must contain "title"
    model: the model class to instantiate (a work or an edition)
    returns whatever update_book_from_data returns for the new record
    '''
    manager = model.objects
    # the row is created first with the minimal fields so that the
    # follow-up step has a saved instance to work with
    new_book = manager.create(
        openlibrary_key=key,
        title=data['title'],
        connector=self.connector,
    )
    populated = self.update_book_from_data(new_book, data)
    return populated
def update_book_from_data(self, book, data):
    ''' update a book model instance from openlibrary data

    book: the work or edition instance to fill in
    data: the openlibrary json blob describing that book
    returns the same book instance
    '''
    # simple fields are copied over via the connector's mapping table
    update_from_mappings(book, data, self.book_mappings)
    book.save()

    # link every author referenced by the blob
    for author in self.get_authors_from_data(data):
        book.authors.add(author)

    # only the first listed cover is fetched and stored
    covers = data.get('covers')
    if covers:
        cover_args = self.get_cover(covers[0])
        book.cover.save(*cover_args, save=True)

    return book
def update_book(self, book): def update_book(self, book):
''' load new data '''
if not book.sync and not book.sync_cover:
return
data = self.load_book_data(book.openlibrary_key)
if book.sync_cover and data.get('covers'):
book.cover.save(*self.get_cover(data['covers'][0]), save=True)
if book.sync:
book = self.update_book_from_data(book, data)
return book
def get_authors_from_data(self, data):
    ''' turn the author references in an openlibrary blob into authors

    data: openlibrary json blob; a missing "authors" entry yields []
    returns a list with one get_or_create_author result per reference
    '''
    # a reference is either given directly or nested under an "author" key
    references = (
        entry.get('author', entry) for entry in data.get('authors', []))
    # keys look like "/authors/OL1234567A"; only the trailing id is wanted
    return [
        self.get_or_create_author(ref['key'].rsplit('/', 1)[-1])
        for ref in references
    ]
def load_book_data(self, olkey):
''' query openlibrary for data on a book ''' ''' query openlibrary for data on a book '''
olkey = book.openlibrary_key
# load the book json from openlibrary.org
response = requests.get('%s/works/%s.json' % (self.url, olkey)) response = requests.get('%s/works/%s.json' % (self.url, olkey))
if not response.ok: if not response.ok:
response.raise_for_status() response.raise_for_status()
data = response.json() data = response.json()
if not book.source_url: return data
book.source_url = response.url
return self.update_from_data(book, data)
def update_from_data(self, book, data): def load_edition_data(self, olkey):
''' update a book from a json blob ''' ''' query openlibrary for editions of a work '''
mappings = { response = requests.get(
'publish_date': ('published_date', get_date), '%s/works/%s/editions.json' % (self.url, olkey))
'first_publish_date': ('first_published_date', get_date), if not response.ok:
'description': ('description', get_description), response.raise_for_status()
'isbn_13': ('isbn', lambda a: a[0]), data = response.json()
'oclc_numbers': ('oclc_number', lambda a: a[0]), return data
'lccn': ('lccn', lambda a: a[0]),
'languages': ('languages', get_languages),
'number_of_pages': ('pages', None),
'series': ('series', lambda a: a[0]),
}
book = update_from_mappings(book, data, mappings)
if 'identifiers' in data:
if 'goodreads' in data['identifiers']:
book.goodreads_key = data['identifiers']['goodreads'][0]
if 'series' in data and len(data['series']) > 1:
book.series_number = data['series'][1]
if not book.connector:
book.connector = self.connector
book.save()
# this book sure as heck better be an edition
if data.get('works'):
key = data.get('works')[0]['key']
key = key.split('/')[-1]
work = self.get_or_create_book(key)
book.parent_work = work
if isinstance(book, models.Work):
# load editions of a work
self.get_editions_of_work(book)
# we also need to know the author get the cover
for author_blob in data.get('authors', []):
# this id is "/authors/OL1234567A" and we want just "OL1234567A"
author_blob = author_blob.get('author', author_blob)
author_id = author_blob['key']
author_id = author_id.split('/')[-1]
book.authors.add(self.get_or_create_author(author_id))
if not data.get('authors') and book.parent_work.authors.count():
book.authors.set(book.parent_work.authors.all())
if book.sync_cover and data.get('covers') and len(data['covers']):
book.cover.save(*self.get_cover(data['covers'][0]), save=True)
return book
def expand_book_data(self, book): def expand_book_data(self, book):
work = book work = book
if isinstance(book, models.Edition): if isinstance(book, models.Edition):
work = book.parent_work work = book.parent_work
self.get_editions_of_work(work, default_only=False)
edition_options = self.load_edition_data(work.openlibrary_key)
def get_editions_of_work(self, work, default_only=True): for edition_data in edition_options.get('entries'):
''' get all editions of a work ''' olkey = edition_data.get('key').split('/')[-1]
response = requests.get( if models.Edition.objects.filter(openlibrary_key=olkey).count():
'%s/works/%s/editions.json' % (self.url, work.openlibrary_key)) continue
edition_data = response.json() edition = self.create_book(olkey, edition_data, models.Edition)
edition.parent_work = work
options = edition_data.get('entries', []) edition.save()
if default_only and len(options) > 1: if not edition.authors and work.authors:
options = [e for e in options if e.get('cover')] or options edition.authors.set(work.authors.all())
options = [e for e in options if \
'/languages/eng' in str(e.get('languages'))] or options
formats = ['paperback', 'hardcover', 'mass market paperback']
options = [e for e in options if \
str(e.get('physical_format')).lower() in formats] or options
options = [e for e in options if e.get('isbn_13')] or options
options = [e for e in options if e.get('ocaid')] or options
if not options:
options = edition_data.get('entries', [])
options = options[:1]
for data in options:
try:
olkey = data['key'].split('/')[-1]
except KeyError:
# bad data I guess?
return
try:
models.Edition.objects.get(openlibrary_key=olkey)
except models.Edition.DoesNotExist:
book = models.Edition.objects.create(openlibrary_key=olkey)
self.update_from_data(book, data)
def get_or_create_author(self, olkey): def get_or_create_author(self, olkey):
@ -228,3 +245,21 @@ def get_languages(language_blob):
return langs return langs
def pick_default_edition(options):
    ''' favor physical copies with covers in english

    options: list of edition json blobs (dicts), or None
    returns the preferred edition blob, or None when there is nothing
    to choose from

    Each preference is applied in turn and narrows the candidates only
    when at least one candidate satisfies it, so a preference nobody meets
    is skipped rather than emptying the list.
    '''
    # a missing "entries" value arrives here as None; treat it like []
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    formats = ['paperback', 'hardcover', 'mass market paperback']
    preferences = [
        # the rest of the connector reads the "covers" list; "cover" is
        # still accepted so blobs that matched before keep matching
        lambda e: e.get('covers') or e.get('cover'),
        lambda e: '/languages/eng' in str(e.get('languages')),
        lambda e: str(e.get('physical_format')).lower() in formats,
        lambda e: e.get('isbn_13'),
        lambda e: e.get('ocaid'),
    ]
    for is_preferred in preferences:
        options = [e for e in options if is_preferred(e)] or options
    return options[0]

View File

@ -352,6 +352,8 @@ def book_page(request, book_identifier, tab='friends'):
if isinstance(book, models.Work): if isinstance(book, models.Work):
book = book.default_edition book = book.default_edition
if not book:
return HttpResponseNotFound()
work = book.parent_work work = book.parent_work
if not work: if not work: