diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py
index 455241cc..8d2e9f15 100644
--- a/bookwyrm/connectors/abstract_connector.py
+++ b/bookwyrm/connectors/abstract_connector.py
@@ -9,6 +9,7 @@ from requests.exceptions import RequestException
 
 from bookwyrm import activitypub, models, settings
 from .connector_manager import load_more_data, ConnectorException
+from .format_mappings import format_mappings
 
 logger = logging.getLogger(__name__)
 
@@ -312,3 +313,22 @@ class Mapping:
             return self.formatter(value)
         except:  # pylint: disable=bare-except
             return None
+
+
+def infer_physical_format(format_text):
+    """ try to figure out what the standardized format is from the free value
+
+    returns the standardized format name, or None when the text is empty or
+    holds no known keyword
+    """
+    if not format_text:
+        return None
+    format_text = format_text.lower()
+    # try a direct match first
+    if format_text in format_mappings:
+        return format_mappings[format_text]
+    # failing that, use the first mapping whose key appears in the text
+    return next(
+        (value for key, value in format_mappings.items() if key in format_text),
+        None,
+    )
diff --git a/bookwyrm/connectors/format_mappings.py b/bookwyrm/connectors/format_mappings.py
new file mode 100644
index 00000000..61f61efa
--- /dev/null
+++ b/bookwyrm/connectors/format_mappings.py
@@ -0,0 +1,45 @@
+""" comparing a free text format to the standardized one """
+# keys are compared against lowercased text, first exactly and then as
+# substrings in insertion order, so the first matching key wins
+format_mappings = {
+    "paperback": "Paperback",
+    "soft": "Paperback",
+    "pamphlet": "Paperback",
+    "peperback": "Paperback",
+    "tapa blanda": "Paperback",
+    "turtleback": "Paperback",
+    "pocket": "Paperback",
+    "spiral": "Paperback",
+    "ring": "Paperback",
+    "平装": "Paperback",
+    "简装": "Paperback",
+    "hardcover": "Hardcover",
+    "hardcocer": "Hardcover",
+    "hardover": "Hardcover",
+    "hardback": "Hardcover",
+    "library": "Hardcover",
+    "tapa dura": "Hardcover",
+    "leather": "Hardcover",
+    "clothbound": "Hardcover",
+    "精装": "Hardcover",
+    "ebook": "EBook",
+    "e-book": "EBook",
+    "digital": "EBook",
+    "computer file": "EBook",
+    "epub": "EBook",
+    "online": "EBook",
+    "pdf": "EBook",
+    "elektronische": "EBook",
+    "electronic": "EBook",
+    "kindle": "EBook",
+    "audiobook": "AudiobookFormat",
+    "audio": "AudiobookFormat",
+    "cd": "AudiobookFormat",
+    "dvd": "AudiobookFormat",
+    "mp3": "AudiobookFormat",
+    "cassette": "AudiobookFormat",
+    "talking": "AudiobookFormat",
+    "sound": "AudiobookFormat",
+    "comic": "GraphicNovel",
+    "graphic": "GraphicNovel",
+}
diff --git a/bookwyrm/connectors/openlibrary.py b/bookwyrm/connectors/openlibrary.py
index fca5d0f7..7f724d74 100644
--- a/bookwyrm/connectors/openlibrary.py
+++ b/bookwyrm/connectors/openlibrary.py
@@ -3,7 +3,7 @@ import re
 
 from bookwyrm import models
 from .abstract_connector import AbstractConnector, SearchResult, Mapping
-from .abstract_connector import get_data
+from .abstract_connector import get_data, infer_physical_format
 from .connector_manager import ConnectorException
 from .openlibrary_languages import languages
 
@@ -43,7 +43,12 @@ class Connector(AbstractConnector):
             ),
             Mapping("publishedDate", remote_field="publish_date"),
             Mapping("pages", remote_field="number_of_pages"),
-            Mapping("physicalFormat", remote_field="physical_format"),
+            Mapping(
+                "physicalFormat",
+                remote_field="physical_format",
+                formatter=infer_physical_format,
+            ),
+            Mapping("physicalFormatDetail", remote_field="physical_format"),
             Mapping("publishers"),
         ]
 
diff --git a/bookwyrm/migrations/0101_auto_20210929_1847.py b/bookwyrm/migrations/0101_auto_20210929_1847.py
index 346dbf88..2acaa127 100644
--- a/bookwyrm/migrations/0101_auto_20210929_1847.py
+++ b/bookwyrm/migrations/0101_auto_20210929_1847.py
@@ -2,6 +2,7 @@
 
 from django.db import migrations
 import bookwyrm
+from bookwyrm.connectors.abstract_connector import infer_physical_format
 
 
 def infer_format(app_registry, schema_editor):
@@ -13,59 +14,14 @@ def infer_format(app_registry, schema_editor):
         .objects.using(db_alias)
         .filter(physical_format_detail__isnull=False)
     )
-    mappings = {
-        "paperback": "Paperback",
-        "soft": "Paperback",
-        "pamphlet": "Paperback",
-        "peperback": "Paperback",
-        "tapa blanda": "Paperback",
-        "turtleback": "Paperback",
-        "pocket": "Paperback",
-        "spiral": "Paperback",
-        "ring": "Paperback",
-        "平装": "Paperback",
-        "简装": "Paperback",
-        "hardcover": "Hardcover",
-        "hardcocer": "Hardcover",
-        "hardover": "Hardcover",
-        "hardback": "Hardcover",
-        "library": "Hardcover",
-        "tapa dura": "Hardcover",
-        "leather": "Hardcover",
-        "clothbound": "Hardcover",
-        "精装": "Hardcover",
-        "ebook": "EBook",
-        "e-book": "EBook",
-        "digital": "EBook",
-        "computer file": "EBook",
-        "epub": "EBook",
-        "online": "EBook",
-        "pdf": "EBook",
-        "elektronische": "EBook",
-        "electronic": "EBook",
-        "audiobook": "AudiobookFormat",
-        "audio": "AudiobookFormat",
-        "cd": "AudiobookFormat",
-        "dvd": "AudiobookFormat",
-        "mp3": "AudiobookFormat",
-        "cassette": "AudiobookFormat",
-        "kindle": "AudiobookFormat",
-        "talking": "AudiobookFormat",
-        "sound": "AudiobookFormat",
-        "comic": "GraphicNovel",
-        "graphic": "GraphicNovel",
-    }
     for edition in editions:
         free_format = edition.physical_format_detail.lower()
-        if free_format in mappings:
-            edition.physical_format = mappings[free_format]
-            edition.save()
-        else:
-            matches = [v for k, v in mappings.items() if k in free_format]
-            if not matches:
-                continue
-            edition.physical_format = matches[0]
-            edition.save()
+        inferred_format = infer_physical_format(free_format)
+        if inferred_format is None:
+            # nothing recognizable: leave the edition untouched and unsaved
+            continue
+        edition.physical_format = inferred_format
+        edition.save()
 
 
 def reverse(app_registry, schema_editor):