diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py index 8b0e3c4c..4edd2818 100644 --- a/bookwyrm/sanitize_html.py +++ b/bookwyrm/sanitize_html.py @@ -22,6 +22,7 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method "ol", "li", ] + self.allowed_attrs = ["href", "rel", "src", "alt"] self.tag_stack = [] self.output = [] # if the html appears invalid, we just won't allow any at all @@ -30,7 +31,14 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method def handle_starttag(self, tag, attrs): """check if the tag is valid""" if self.allow_html and tag in self.allowed_tags: - self.output.append(("tag", self.get_starttag_text())) + allowed_attrs = " ".join( + f'{a}="{v}"' for a, v in attrs if a in self.allowed_attrs + ) + reconstructed = f"<{tag}" + if allowed_attrs: + reconstructed += " " + allowed_attrs + reconstructed += ">" + self.output.append(("tag", reconstructed)) self.tag_stack.append(tag) else: self.output.append(("data", "")) diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py index 6c405348..5814f220 100644 --- a/bookwyrm/tests/test_sanitize_html.py +++ b/bookwyrm/tests/test_sanitize_html.py @@ -24,13 +24,21 @@ class Sanitizer(TestCase): self.assertEqual(input_text, output) def test_valid_html_attrs(self): - """and don't remove attributes""" + """and don't remove useful attributes""" input_text = 'yes html' parser = InputHtmlParser() parser.feed(input_text) output = parser.get_output() self.assertEqual(input_text, output) + def test_valid_html_invalid_attrs(self): + """do remove un-approved attributes""" + input_text = 'yes html' + parser = InputHtmlParser() + parser.feed(input_text) + output = parser.get_output() + self.assertEqual(output, 'yes html') + def test_invalid_html(self): """remove all html when the html is malformed""" input_text = "yes html"