Sanitize incoming html

This commit is contained in:
Mouse Reeve 2020-12-16 16:47:05 -08:00
parent d79a756813
commit a3c7d324d6
8 changed files with 62 additions and 11 deletions

View File

@ -0,0 +1,39 @@
# Generated by Django 3.0.7 on 2020-12-17 00:46
import bookwyrm.models.fields
from django.db import migrations
class Migration(migrations.Migration):
    ''' alter five existing text columns so they use HtmlField '''

    dependencies = [('bookwyrm', '0024_merge_20201216_1721')]

    # one AlterField per (model, column, field options) row, in the same
    # order the operations were originally generated
    operations = [
        migrations.AlterField(
            model_name=model,
            name=column,
            field=bookwyrm.models.fields.HtmlField(**options),
        )
        for model, column, options in (
            ('author', 'bio', {'blank': True, 'null': True}),
            ('book', 'description', {'blank': True, 'null': True}),
            ('quotation', 'quote', {}),
            ('status', 'content', {'blank': True, 'null': True}),
            ('user', 'summary', {'default': ''}),
        )
    ]

View File

@ -25,7 +25,7 @@ class Author(ActivitypubMixin, BookWyrmModel):
aliases = fields.ArrayField( aliases = fields.ArrayField(
models.CharField(max_length=255), blank=True, default=list models.CharField(max_length=255), blank=True, default=list
) )
bio = fields.TextField(null=True, blank=True) bio = fields.HtmlField(null=True, blank=True)
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
''' can't be abstract for query reasons, but you shouldn't USE it ''' ''' can't be abstract for query reasons, but you shouldn't USE it '''

View File

@ -36,7 +36,7 @@ class Book(ActivitypubMixin, BookWyrmModel):
title = fields.CharField(max_length=255) title = fields.CharField(max_length=255)
sort_title = fields.CharField(max_length=255, blank=True, null=True) sort_title = fields.CharField(max_length=255, blank=True, null=True)
subtitle = fields.CharField(max_length=255, blank=True, null=True) subtitle = fields.CharField(max_length=255, blank=True, null=True)
description = fields.TextField(blank=True, null=True) description = fields.HtmlField(blank=True, null=True)
languages = fields.ArrayField( languages = fields.ArrayField(
models.CharField(max_length=255), blank=True, default=list models.CharField(max_length=255), blank=True, default=list
) )

View File

@ -12,6 +12,7 @@ from django.db import models
from django.utils import timezone from django.utils import timezone
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.sanitize_html import InputHtmlParser
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.connectors import get_image from bookwyrm.connectors import get_image
@ -362,6 +363,15 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField):
except (ParserError, TypeError): except (ParserError, TypeError):
return None return None
class HtmlField(ActivitypubFieldMixin, models.TextField):
    ''' text field whose activitypub input is passed through InputHtmlParser '''
    def field_from_activity(self, value):
        ''' return the parser's output for value, or None when value is
        falsy or equal to MISSING '''
        if value and value != MISSING:
            parser = InputHtmlParser()
            parser.feed(value)
            return parser.get_output()
        # nothing usable was supplied
        return None
class ArrayField(ActivitypubFieldMixin, DjangoArrayField): class ArrayField(ActivitypubFieldMixin, DjangoArrayField):
''' activitypub-aware array field ''' ''' activitypub-aware array field '''
def field_to_activity(self, value): def field_to_activity(self, value):

View File

@ -14,7 +14,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel):
''' any post, like a reply to a review, etc ''' ''' any post, like a reply to a review, etc '''
user = fields.ForeignKey( user = fields.ForeignKey(
'User', on_delete=models.PROTECT, activitypub_field='attributedTo') 'User', on_delete=models.PROTECT, activitypub_field='attributedTo')
content = fields.TextField(blank=True, null=True) content = fields.HtmlField(blank=True, null=True)
mention_users = fields.TagField('User', related_name='mention_user') mention_users = fields.TagField('User', related_name='mention_user')
mention_books = fields.TagField('Edition', related_name='mention_book') mention_books = fields.TagField('Edition', related_name='mention_book')
local = models.BooleanField(default=True) local = models.BooleanField(default=True)
@ -134,7 +134,7 @@ class Comment(Status):
class Quotation(Status): class Quotation(Status):
''' like a review but without a rating and transient ''' ''' like a review but without a rating and transient '''
quote = fields.TextField() quote = fields.HtmlField()
book = fields.ForeignKey( book = fields.ForeignKey(
'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook') 'Edition', on_delete=models.PROTECT, activitypub_field='inReplyToBook')

View File

@ -42,7 +42,7 @@ class User(OrderedCollectionPageMixin, AbstractUser):
blank=True, blank=True,
) )
outbox = fields.RemoteIdField(unique=True) outbox = fields.RemoteIdField(unique=True)
summary = fields.TextField(default='') summary = fields.HtmlField(default='')
local = models.BooleanField(default=False) local = models.BooleanField(default=False)
bookwyrm_user = fields.BooleanField(default=True) bookwyrm_user = fields.BooleanField(default=True)
localname = models.CharField( localname = models.CharField(

View File

@ -1,7 +1,7 @@
''' html parser to clean up incoming text from unknown sources ''' ''' html parser to clean up incoming text from unknown sources '''
from html.parser import HTMLParser from html.parser import HTMLParser
class InputHtmlParser(HTMLParser): class InputHtmlParser(HTMLParser):#pylint: disable=abstract-method
''' Removes any html that isn't allowed from a block ''' ''' Removes any html that isn't allowed from a block '''
def __init__(self): def __init__(self):

View File

@ -1,34 +1,36 @@
''' make sure only valid html gets to the app '''
from django.test import TestCase from django.test import TestCase
from bookwyrm.sanitize_html import InputHtmlParser from bookwyrm.sanitize_html import InputHtmlParser
class Sanitizer(TestCase): class Sanitizer(TestCase):
''' sanitizer tests '''
def test_no_html(self): def test_no_html(self):
''' just text '''
input_text = 'no html ' input_text = 'no html '
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html(self): def test_valid_html(self):
''' leave the html untouched '''
input_text = '<b>yes </b> <i>html</i>' input_text = '<b>yes </b> <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_valid_html_attrs(self): def test_valid_html_attrs(self):
''' and don't remove attributes '''
input_text = '<a href="fish.com">yes </a> <i>html</i>' input_text = '<a href="fish.com">yes </a> <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
output = parser.get_output() output = parser.get_output()
self.assertEqual(input_text, output) self.assertEqual(input_text, output)
def test_invalid_html(self): def test_invalid_html(self):
''' remove all html when the html is malformed '''
input_text = '<b>yes <i>html</i>' input_text = '<b>yes <i>html</i>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)
@ -41,8 +43,8 @@ class Sanitizer(TestCase):
output = parser.get_output() output = parser.get_output()
self.assertEqual('yes html ', output) self.assertEqual('yes html ', output)
def test_disallowed_html(self): def test_disallowed_html(self):
''' remove disallowed html but keep allowed html '''
input_text = '<div> yes <i>html</i></div>' input_text = '<div> yes <i>html</i></div>'
parser = InputHtmlParser() parser = InputHtmlParser()
parser.feed(input_text) parser.feed(input_text)