From a3c7d324d67fe29202e76fb332b7c328e61faf3c Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Wed, 16 Dec 2020 16:47:05 -0800 Subject: [PATCH] Sanitize incoming html --- .../migrations/0025_auto_20201217_0046.py | 39 +++++++++++++++++++ bookwyrm/models/author.py | 2 +- bookwyrm/models/book.py | 2 +- bookwyrm/models/fields.py | 10 +++++ bookwyrm/models/status.py | 4 +- bookwyrm/models/user.py | 2 +- bookwyrm/sanitize_html.py | 2 +- bookwyrm/tests/test_sanitize_html.py | 12 +++--- 8 files changed, 62 insertions(+), 11 deletions(-) create mode 100644 bookwyrm/migrations/0025_auto_20201217_0046.py diff --git a/bookwyrm/migrations/0025_auto_20201217_0046.py b/bookwyrm/migrations/0025_auto_20201217_0046.py new file mode 100644 index 00000000..a3ffe8c1 --- /dev/null +++ b/bookwyrm/migrations/0025_auto_20201217_0046.py @@ -0,0 +1,39 @@ +# Generated by Django 3.0.7 on 2020-12-17 00:46 + +import bookwyrm.models.fields +from django.db import migrations + + +class Migration(migrations.Migration): + + dependencies = [ + ('bookwyrm', '0024_merge_20201216_1721'), + ] + + operations = [ + migrations.AlterField( + model_name='author', + name='bio', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='book', + name='description', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='quotation', + name='quote', + field=bookwyrm.models.fields.HtmlField(), + ), + migrations.AlterField( + model_name='status', + name='content', + field=bookwyrm.models.fields.HtmlField(blank=True, null=True), + ), + migrations.AlterField( + model_name='user', + name='summary', + field=bookwyrm.models.fields.HtmlField(default=''), + ), + ] diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index 331d2dd6..47714d4e 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -25,7 +25,7 @@ class Author(ActivitypubMixin, BookWyrmModel): aliases = fields.ArrayField( models.CharField(max_length=255), blank=True, default=list ) - bio = fields.TextField(null=True, blank=True) + bio = fields.HtmlField(null=True, blank=True) def save(self, *args, **kwargs): ''' can't be abstract for query reasons, but you shouldn't USE it ''' diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index bcd4bc04..3080a115 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -36,7 +36,7 @@ class Book(ActivitypubMixin, BookWyrmModel): title = fields.CharField(max_length=255) sort_title = fields.CharField(max_length=255, blank=True, null=True) subtitle = fields.CharField(max_length=255, blank=True, null=True) - description = fields.TextField(blank=True, null=True) + description = fields.HtmlField(blank=True, null=True) languages = fields.ArrayField( models.CharField(max_length=255), blank=True, default=list ) diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index 5e12f5d5..52933715 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -12,6 +12,7 @@ from django.db import models from django.utils import timezone from django.utils.translation import gettext_lazy as _ from bookwyrm import activitypub +from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.settings import DOMAIN from bookwyrm.connectors import get_image @@ -362,6 +363,15 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField): except (ParserError, TypeError): return None +class HtmlField(ActivitypubFieldMixin, models.TextField): + ''' a text field for storing html ''' + def field_from_activity(self, value): + if not value or value == MISSING: + return None + sanitizer = InputHtmlParser() + sanitizer.feed(value) + return sanitizer.get_output() + class ArrayField(ActivitypubFieldMixin, DjangoArrayField): ''' activitypub-aware array field ''' def field_to_activity(self, value): diff --git a/bookwyrm/models/status.py b/bookwyrm/models/status.py index fcf4a290..66114e7c 100644 --- a/bookwyrm/models/status.py +++ b/bookwyrm/models/status.py @@ -14,7 +14,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel): ''' any post, like a reply to a review, etc ''' user = fields.ForeignKey( 'User', on_delete=models.PROTECT, activitypub_field='attributedTo') - content = fields.TextField(blank=True, null=True) + content = fields.HtmlField(blank=True, null=True) mention_users = fields.TagField('User', related_name='mention_user') mention_books = fields.TagField('Edition', related_name='mention_book') local = models.BooleanField(default=True) @@ -134,7 +134,7 @@ class Comment(Status): class Quotation(Status): ''' like a review but without a rating and transient ''' - quote = fields.TextField() + quote = fields.HtmlField() book = fields.ForeignKey( 'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook') diff --git a/bookwyrm/models/user.py b/bookwyrm/models/user.py index 63549d36..9d66eb5c 100644 --- a/bookwyrm/models/user.py +++ b/bookwyrm/models/user.py @@ -42,7 +42,7 @@ class User(OrderedCollectionPageMixin, AbstractUser): blank=True, ) outbox = fields.RemoteIdField(unique=True) - summary = fields.TextField(default='') + summary = fields.HtmlField(default='') local = models.BooleanField(default=False) bookwyrm_user = fields.BooleanField(default=True) localname = models.CharField( diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py index 9c5ca73a..933fc43c 100644 --- a/bookwyrm/sanitize_html.py +++ b/bookwyrm/sanitize_html.py @@ -1,7 +1,7 @@ ''' html parser to clean up incoming text from unknown sources ''' from html.parser import HTMLParser -class InputHtmlParser(HTMLParser): +class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method ''' Removes any html that isn't allowed_tagsed from a block ''' def __init__(self): diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index 3344a934..58d94311 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -1,34 +1,36 @@ +''' make sure only valid html gets to the app ''' from django.test import TestCase from bookwyrm.sanitize_html import InputHtmlParser - class Sanitizer(TestCase): + ''' sanitizer tests ''' def test_no_html(self): + ''' just text ''' input_text = 'no html ' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_valid_html(self): + ''' leave the html untouched ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_valid_html_attrs(self): + ''' and don't remove attributes ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) - def test_invalid_html(self): + ''' remove all html when the html is malformed ''' input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) @@ -41,8 +43,8 @@ class Sanitizer(TestCase): output = parser.get_output() self.assertEqual('yes html ', output) - def test_disallowed_html(self): + ''' remove disallowed html but keep allowed html ''' input_text = '
yes html
' parser = InputHtmlParser() parser.feed(input_text)