Adds deduplication fields

This commit is contained in:
Mouse Reeve
2020-12-12 13:39:55 -08:00
parent 31a407d74a
commit 7c43fa1f7c
10 changed files with 174 additions and 73 deletions

View File

@ -12,11 +12,11 @@ from . import fields
class Author(ActivitypubMixin, BookWyrmModel):
''' basic biographic info '''
origin_id = models.CharField(max_length=255, null=True)
''' copy of an author from OL '''
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True)
openlibrary_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
sync = models.BooleanField(default=True)
last_sync_date = models.DateTimeField(default=timezone.now)
wikipedia_link = fields.CharField(max_length=255, blank=True, null=True)
wikipedia_link = fields.CharField(max_length=255, blank=True, null=True, deduplication_field=True)
# idk probably other keys would be useful here?
born = fields.DateTimeField(blank=True, null=True)
died = fields.DateTimeField(blank=True, null=True)

View File

@ -1,5 +1,7 @@
''' base model with default fields '''
from base64 import b64encode
from functools import reduce
import operator
from uuid import uuid4
from Crypto.PublicKey import RSA
@ -7,6 +9,7 @@ from Crypto.Signature import pkcs1_15
from Crypto.Hash import SHA256
from django.core.paginator import Paginator
from django.db import models
from django.db.models import Q
from django.dispatch import receiver
from bookwyrm import activitypub
@ -64,6 +67,50 @@ class ActivitypubMixin:
activity_serializer = lambda: {}
reverse_unfurl = False
@classmethod
def find_existing_by_remote_id(cls, remote_id):
    ''' find an existing object using only its remote id.

    Wraps the remote id in a minimal data dict under the 'id' key and
    hands it to find_existing, returning whatever that returns. '''
    lookup = {'id': remote_id}
    return cls.find_existing(lookup)
@classmethod
def find_existing(cls, data):
    ''' compare data to fields that can be used for deduplication.

    Every model field flagged with deduplication_field is looked up in
    data under that field's activitypub_field key. Models that have an
    origin_id attribute are additionally matched on the incoming 'id'.
    The individual conditions are OR-ed together, so a hit on any single
    identifier (for example an isbn on an edition) counts as a match.

    data: dict of incoming activitypub values (must support .get)
    returns: the first matching object, or None when data offers no
        usable identifier or nothing in the database matches '''
    filters = []
    for field in cls._meta.get_fields():
        # fields without the flag (or with it switched off) never take part
        if not getattr(field, 'deduplication_field', False):
            continue

        value = data.get(field.activitypub_field)
        if not value:
            # a missing or empty value can't identify anything
            continue
        filters.append({field.name: value})

    # kinda janky, but this handles special case for books
    # The id has to be non-empty: filter(origin_id=None) is an IS NULL
    # lookup in django, which would match any object lacking an origin_id
    # instead of matching nothing.
    incoming_id = data.get('id')
    if hasattr(cls, 'origin_id') and incoming_id:
        filters.append({'origin_id': incoming_id})

    if not filters:
        # if there are no deduplication fields, it will match the first
        # item no matter what. this shouldn't happen but just in case.
        return None

    objects = cls.objects
    if hasattr(objects, 'select_subclasses'):
        # presumably an inheritance-aware manager, so that the result is
        # the concrete subclass instance -- verify against the manager
        objects = objects.select_subclasses()

    # an OR operation on all the match fields
    any_identifier = reduce(operator.or_, (Q(**f) for f in filters))

    # there OUGHT to be only one match
    return objects.filter(any_identifier).first()
def to_activity(self):
''' convert from a model to an activity '''
activity = {}

View File

@ -16,9 +16,12 @@ class Book(ActivitypubMixin, BookWyrmModel):
''' a generic book, which can mean either an edition or a work '''
origin_id = models.CharField(max_length=255, null=True, blank=True)
# these identifiers apply to both works and editions
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True)
librarything_key = fields.CharField(max_length=255, blank=True, null=True)
goodreads_key = fields.CharField(max_length=255, blank=True, null=True)
openlibrary_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
librarything_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
goodreads_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
# info about where the data comes from and where/if to sync
sync = models.BooleanField(default=True)
@ -83,7 +86,8 @@ class Book(ActivitypubMixin, BookWyrmModel):
class Work(OrderedCollectionPageMixin, Book):
''' a work (an abstract concept of a book that manifests in an edition) '''
# library of congress catalog control number
lccn = fields.CharField(max_length=255, blank=True, null=True)
lccn = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
# this has to be nullable but should never be null
default_edition = fields.ForeignKey(
'Edition',
@ -103,10 +107,14 @@ class Work(OrderedCollectionPageMixin, Book):
class Edition(Book):
''' an edition of a book '''
# these identifiers only apply to editions, not works
isbn_10 = fields.CharField(max_length=255, blank=True, null=True)
isbn_13 = fields.CharField(max_length=255, blank=True, null=True)
oclc_number = fields.CharField(max_length=255, blank=True, null=True)
asin = fields.CharField(max_length=255, blank=True, null=True)
isbn_10 = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
isbn_13 = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
oclc_number = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
asin = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
pages = fields.IntegerField(blank=True, null=True)
physical_format = fields.CharField(max_length=255, blank=True, null=True)
publishers = fields.ArrayField(

View File

@ -28,7 +28,9 @@ def validate_remote_id(value):
class ActivitypubFieldMixin:
''' make a database field serializable '''
def __init__(self, *args, \
activitypub_field=None, activitypub_wrapper=None, **kwargs):
activitypub_field=None, activitypub_wrapper=None,
deduplication_field=False, **kwargs):
self.deduplication_field = deduplication_field
if activitypub_wrapper:
self.activitypub_wrapper = activitypub_field
self.activitypub_field = activitypub_wrapper
@ -86,6 +88,8 @@ class RemoteIdField(ActivitypubFieldMixin, models.CharField):
*args, max_length=max_length, validators=validators,
**kwargs
)
# for this field, the default is true. false everywhere else.
self.deduplication_field = kwargs.get('deduplication_field', True)
class UsernameField(ActivitypubFieldMixin, models.CharField):

View File

@ -32,7 +32,9 @@ class User(OrderedCollectionPageMixin, AbstractUser):
inbox = fields.RemoteIdField(unique=True)
shared_inbox = fields.RemoteIdField(
activitypub_field='sharedInbox',
activitypub_wrapper='endpoints', null=True)
activitypub_wrapper='endpoints',
deduplication_field=False,
null=True)
federated_server = models.ForeignKey(
'FederatedServer',
on_delete=models.PROTECT,