diff --git a/bookwyrm/sanitize_html.py b/bookwyrm/sanitize_html.py
index 8b0e3c4c..4edd2818 100644
--- a/bookwyrm/sanitize_html.py
+++ b/bookwyrm/sanitize_html.py
@@ -22,6 +22,7 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
"ol",
"li",
]
+ self.allowed_attrs = ["href", "rel", "src", "alt"]
self.tag_stack = []
self.output = []
# if the html appears invalid, we just won't allow any at all
@@ -30,7 +31,24 @@ class InputHtmlParser(HTMLParser): # pylint: disable=abstract-method
     def handle_starttag(self, tag, attrs):
         """check if the tag is valid"""
         if self.allow_html and tag in self.allowed_tags:
-            self.output.append(("tag", self.get_starttag_text()))
+            # Rebuild the tag from scratch, keeping only allow-listed attributes.
+            # HTMLParser hands attribute values over already unescaped, so a
+            # value written as `&quot; onclick=&quot;x` arrives holding literal
+            # quotes; it has to be re-escaped or it could close its own attribute
+            # and inject a new one. Valueless attributes (`<a href>`) arrive as
+            # None and are written back bare instead of as the text "None".
+            # Imported locally so this method needs no module-level `import html`.
+            from html import escape  # pylint: disable=import-outside-toplevel
+            allowed_attrs = " ".join(
+                a if v is None else f'{a}="{escape(v, quote=True)}"'
+                for a, v in attrs
+                if a in self.allowed_attrs
+            )
+            reconstructed = f"<{tag}"
+            if allowed_attrs:
+                reconstructed += " " + allowed_attrs
+            reconstructed += ">"
+            self.output.append(("tag", reconstructed))
             self.tag_stack.append(tag)
         else:
             self.output.append(("data", ""))
diff --git a/bookwyrm/tests/test_sanitize_html.py b/bookwyrm/tests/test_sanitize_html.py
index 6c405348..5814f220 100644
--- a/bookwyrm/tests/test_sanitize_html.py
+++ b/bookwyrm/tests/test_sanitize_html.py
@@ -24,13 +24,21 @@ class Sanitizer(TestCase):
self.assertEqual(input_text, output)
def test_valid_html_attrs(self):
- """and don't remove attributes"""
+ """approved attributes are left alone: output must equal the input"""
input_text = 'yes html'
parser = InputHtmlParser()
parser.feed(input_text)
output = parser.get_output()
self.assertEqual(input_text, output)
+    def test_valid_html_invalid_attrs(self):
+        """attributes that are not approved get stripped from the output"""
+        raw_markup = 'yes html'
+        expected = 'yes html'
+        sanitizer = InputHtmlParser()
+        sanitizer.feed(raw_markup)
+        self.assertEqual(sanitizer.get_output(), expected)
+
def test_invalid_html(self):
"""remove all html when the html is malformed"""
input_text = "yes html"