1 """A cleanup tool for HTML.
2
3 Removes unwanted tags and content. See the `Cleaner` class for
4 details.
5 """
6
7 import re
8 import copy
9 try:
10 from urlparse import urlsplit
11 from urllib import unquote_plus
12 except ImportError:
13
14 from urllib.parse import urlsplit, unquote_plus
15 from lxml import etree
16 from lxml.html import defs
17 from lxml.html import fromstring, XHTML_NAMESPACE
18 from lxml.html import xhtml_to_html, _transform_result
19
# Python 2/3 compatibility aliases.  Each name below is a builtin on only
# some Python versions: referencing it raises NameError where it is missing,
# in which case the closest equivalent is bound under the same name.
try:
    unichr
except NameError:
    # Python 3: chr() already produces unicode characters
    unichr = chr

try:
    unicode
except NameError:
    # Python 3: str is the unicode text type
    unicode = str

try:
    bytes
except NameError:
    # Python < 2.6: byte strings are plain str
    bytes = str

try:
    basestring
except NameError:
    # Python 3: "any string" means text or bytes
    basestring = (str, bytes)
39
40
41 __all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html',
42 'word_break', 'word_break_html']
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65 _css_javascript_re = re.compile(
66 r'expression\s*\(.*?\)', re.S|re.I)
67
68
69 _css_import_re = re.compile(
70 r'@\s*import', re.I)
71
72
73
74 _is_image_dataurl = re.compile(
75 r'^data:image/.+;base64', re.I).search
76 _is_possibly_malicious_scheme = re.compile(
77 r'(?:javascript|jscript|livescript|vbscript|data|about|mocha):',
78 re.I).search
83
84 _substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub
85
86
87
88 _conditional_comment_re = re.compile(
89 r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S)
90
# Every element, the context node included, that carries a ``style``
# attribute.
_find_styled_elements = etree.XPath("descendant-or-self::*[@style]")

# Plain and XHTML-namespaced ``<a>`` elements whose whitespace-normalized
# ``href`` is non-empty and does not begin with ``#``.
_find_external_links = etree.XPath(
    "descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |"
    "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']",
    namespaces={'x': XHTML_NAMESPACE},
)
98
99
101 """
102 Instances clean the document of each of the possible offending
103 elements. The cleaning is controlled by attributes; you can
104 override attributes in a subclass, or set them in the constructor.
105
106 ``scripts``:
107 Removes any ``<script>`` tags.
108
109 ``javascript``:
110 Removes any Javascript, like an ``onclick`` attribute. Also removes stylesheets
111 as they could contain Javascript.
112
113 ``comments``:
114 Removes any comments.
115
116 ``style``:
117 Removes any style tags.
118
119 ``inline_style``:
120 Removes any style attributes. Defaults to the value of the ``style`` option.
121
122 ``links``:
123 Removes any ``<link>`` tags
124
125 ``meta``:
126 Removes any ``<meta>`` tags
127
128 ``page_structure``:
129 Structural parts of a page: ``<head>``, ``<html>``, ``<title>``.
130
131 ``processing_instructions``:
132 Removes any processing instructions.
133
134 ``embedded``:
135 Removes any embedded objects (flash, iframes)
136
137 ``frames``:
138 Removes any frame-related tags
139
140 ``forms``:
141 Removes any form tags
142
143 ``annoying_tags``:
144 Tags that aren't *wrong*, but are annoying. ``<blink>`` and ``<marquee>``
145
146 ``remove_tags``:
147 A list of tags to remove. Only the tags will be removed,
148 their content will get pulled up into the parent tag.
149
150 ``kill_tags``:
151 A list of tags to kill. Killing also removes the tag's content,
152 i.e. the whole subtree, not just the tag itself.
153
154 ``allow_tags``:
155 A list of tags to include (default include all).
156
157 ``remove_unknown_tags``:
158 Remove any tags that aren't standard parts of HTML.
159
160 ``safe_attrs_only``:
161 If true, only include 'safe' attributes (specifically the list
162 from the feedparser HTML sanitisation web site).
163
164 ``safe_attrs``:
165 A set of attribute names to override the default list of attributes
166 considered 'safe' (when safe_attrs_only=True).
167
168 ``add_nofollow``:
169 If true, then any <a> tags will have ``rel="nofollow"`` added to them.
170
171 ``host_whitelist``:
172 A list or set of hosts that you can use for embedded content
173 (for content like ``<object>``, ``<link rel="stylesheet">``, etc).
174 You can also implement/override the method
175 ``allow_embedded_url(el, url)`` or ``allow_element(el)`` to
176 implement more complex rules for what can be embedded.
177 Anything that passes this test will be shown, regardless of
178 the value of (for instance) ``embedded``.
179
180 Note that this parameter might not work as intended if you do not
181 make the links absolute before doing the cleaning.
182
183 Note that you may also need to set ``whitelist_tags``.
184
185 ``whitelist_tags``:
186 A set of tags that can be included with ``host_whitelist``.
187 The default is ``iframe`` and ``embed``; you may wish to
188 include other tags like ``script``, or you may want to
189 implement ``allow_embedded_url`` for more control. Set to None to
190 include all tags.
191
192 This modifies the document *in place*.
193 """
194
195 scripts = True
196 javascript = True
197 comments = True
198 style = False
199 inline_style = None
200 links = True
201 meta = True
202 page_structure = True
203 processing_instructions = True
204 embedded = True
205 frames = True
206 forms = True
207 annoying_tags = True
208 remove_tags = None
209 allow_tags = None
210 kill_tags = None
211 remove_unknown_tags = True
212 safe_attrs_only = True
213 safe_attrs = defs.safe_attrs
214 add_nofollow = False
215 host_whitelist = ()
216 whitelist_tags = set(['iframe', 'embed'])
217
226
227
228
229 _tag_link_attrs = dict(
230 script='src',
231 link='href',
232
233
234 applet=['code', 'object'],
235 iframe='src',
236 embed='src',
237 layer='src',
238
239
240
241
242
243
244
245
246 a='href',
247 )
248
250 """
251 Cleans the document.
252 """
253 if hasattr(doc, 'getroot'):
254
255 doc = doc.getroot()
256
257 xhtml_to_html(doc)
258
259
260 for el in doc.iter('image'):
261 el.tag = 'img'
262 if not self.comments:
263
264
265 self.kill_conditional_comments(doc)
266
267 kill_tags = set(self.kill_tags or ())
268 remove_tags = set(self.remove_tags or ())
269 allow_tags = set(self.allow_tags or ())
270
271 if self.scripts:
272 kill_tags.add('script')
273 if self.safe_attrs_only:
274 safe_attrs = set(self.safe_attrs)
275 for el in doc.iter(etree.Element):
276 attrib = el.attrib
277 for aname in attrib.keys():
278 if aname not in safe_attrs:
279 del attrib[aname]
280 if self.javascript:
281 if not (self.safe_attrs_only and
282 self.safe_attrs == defs.safe_attrs):
283
284 for el in doc.iter(etree.Element):
285 attrib = el.attrib
286 for aname in attrib.keys():
287 if aname.startswith('on'):
288 del attrib[aname]
289 doc.rewrite_links(self._remove_javascript_link,
290 resolve_base_href=False)
291
292
293 if not self.inline_style:
294 for el in _find_styled_elements(doc):
295 old = el.get('style')
296 new = _css_javascript_re.sub('', old)
297 new = _css_import_re.sub('', new)
298 if self._has_sneaky_javascript(new):
299
300 del el.attrib['style']
301 elif new != old:
302 el.set('style', new)
303 if not self.style:
304 for el in list(doc.iter('style')):
305 if el.get('type', '').lower().strip() == 'text/javascript':
306 el.drop_tree()
307 continue
308 old = el.text or ''
309 new = _css_javascript_re.sub('', old)
310
311 new = _css_import_re.sub('', old)
312 if self._has_sneaky_javascript(new):
313
314 el.text = '/* deleted */'
315 elif new != old:
316 el.text = new
317 if self.comments or self.processing_instructions:
318
319
320
321 kill_tags.add(etree.Comment)
322 if self.processing_instructions:
323 kill_tags.add(etree.ProcessingInstruction)
324 if self.style:
325 kill_tags.add('style')
326 if self.inline_style:
327 etree.strip_attributes(doc, 'style')
328 if self.links:
329 kill_tags.add('link')
330 elif self.style or self.javascript:
331
332
333 for el in list(doc.iter('link')):
334 if 'stylesheet' in el.get('rel', '').lower():
335
336 if not self.allow_element(el):
337 el.drop_tree()
338 if self.meta:
339 kill_tags.add('meta')
340 if self.page_structure:
341 remove_tags.update(('head', 'html', 'title'))
342 if self.embedded:
343
344
345
346 for el in list(doc.iter('param')):
347 found_parent = False
348 parent = el.getparent()
349 while parent is not None and parent.tag not in ('applet', 'object'):
350 parent = parent.getparent()
351 if parent is None:
352 el.drop_tree()
353 kill_tags.update(('applet',))
354
355 remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))
356 if self.frames:
357
358
359
360 kill_tags.update(defs.frame_tags)
361 if self.forms:
362 remove_tags.add('form')
363 kill_tags.update(('button', 'input', 'select', 'textarea'))
364 if self.annoying_tags:
365 remove_tags.update(('blink', 'marquee'))
366
367 _remove = []
368 _kill = []
369 for el in doc.iter():
370 if el.tag in kill_tags:
371 if self.allow_element(el):
372 continue
373 _kill.append(el)
374 elif el.tag in remove_tags:
375 if self.allow_element(el):
376 continue
377 _remove.append(el)
378
379 if _remove and _remove[0] == doc:
380
381
382 el = _remove.pop(0)
383 el.tag = 'div'
384 el.attrib.clear()
385 elif _kill and _kill[0] == doc:
386
387
388 el = _kill.pop(0)
389 if el.tag != 'html':
390 el.tag = 'div'
391 el.clear()
392
393 _kill.reverse()
394 for el in _kill:
395 el.drop_tree()
396 for el in _remove:
397 el.drop_tag()
398
399 if self.remove_unknown_tags:
400 if allow_tags:
401 raise ValueError(
402 "It does not make sense to pass in both allow_tags and remove_unknown_tags")
403 allow_tags = set(defs.tags)
404 if allow_tags:
405 bad = []
406 for el in doc.iter():
407 if el.tag not in allow_tags:
408 bad.append(el)
409 if bad:
410 if bad[0] is doc:
411 el = bad.pop(0)
412 el.tag = 'div'
413 el.attrib.clear()
414 for el in bad:
415 el.drop_tag()
416 if self.add_nofollow:
417 for el in _find_external_links(doc):
418 if not self.allow_follow(el):
419 rel = el.get('rel')
420 if rel:
421 if ('nofollow' in rel
422 and ' nofollow ' in (' %s ' % rel)):
423 continue
424 rel = '%s nofollow' % rel
425 else:
426 rel = 'nofollow'
427 el.set('rel', rel)
428
430 """
431 Override to suppress rel="nofollow" on some anchors.
432 """
433 return False
434
436 if el.tag not in self._tag_link_attrs:
437 return False
438 attr = self._tag_link_attrs[el.tag]
439 if isinstance(attr, (list, tuple)):
440 for one_attr in attr:
441 url = el.get(one_attr)
442 if not url:
443 return False
444 if not self.allow_embedded_url(el, url):
445 return False
446 return True
447 else:
448 url = el.get(attr)
449 if not url:
450 return False
451 return self.allow_embedded_url(el, url)
452
454 if (self.whitelist_tags is not None
455 and el.tag not in self.whitelist_tags):
456 return False
457 scheme, netloc, path, query, fragment = urlsplit(url)
458 netloc = netloc.lower().split(':', 1)[0]
459 if scheme not in ('http', 'https'):
460 return False
461 if netloc in self.host_whitelist:
462 return True
463 return False
464
475
477 bad = []
478 for el in doc.iter(iterate):
479 if condition(el):
480 bad.append(el)
481 for el in bad:
482 el.drop_tree()
483
491
492 _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub
493
495 """
496 Depending on the browser, stuff like ``e x p r e s s i o n(...)``
497 can get interpreted, or ``expre/* stuff */ssion(...)``. This
498 checks for attempts to do stuff like this.
499
500 Typically the response will be to kill the entire style; if you
501 have just a bit of Javascript in the style another rule will catch
502 that and remove only the Javascript from the style; this catches
503 more sneaky attempts.
504 """
505 style = self._substitute_comments('', style)
506 style = style.replace('\\', '')
507 style = _substitute_whitespace('', style)
508 style = style.lower()
509 if 'javascript:' in style:
510 return True
511 if 'expression(' in style:
512 return True
513 return False
514
523
524 clean = Cleaner()
525 clean_html = clean.clean_html
526
527
528
529
530
531 _link_regexes = [
532 re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)(?:/[/\-_.,a-z0-9%&?;=~]*)?(?:\([/\-_.,a-z0-9%&?;=~]*\))?)', re.I),
533
534 re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.-]+[a-z]))', re.I),
535 ]
536
537 _avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']
538
539 _avoid_hosts = [
540 re.compile(r'^localhost', re.I),
541 re.compile(r'\bexample\.(?:com|org|net)$', re.I),
542 re.compile(r'^127\.0\.0\.1$'),
543 ]
544
545 _avoid_classes = ['nolink']
546
551 """
552 Turn any URLs into links.
553
554 It will search for links identified by the given regular
555 expressions (by default mailto and http(s) links).
556
557 It won't link text in an element in avoid_elements, or an element
558 with a class in avoid_classes. It won't link to anything with a
559 host that matches one of the regular expressions in avoid_hosts
560 (default localhost, 127.0.0.1 and example.com/.org/.net).
561
562 If you pass in an element, the element's tail will not be
563 substituted, only the contents of the element.
564 """
565 if el.tag in avoid_elements:
566 return
567 class_name = el.get('class')
568 if class_name:
569 class_name = class_name.split()
570 for match_class in avoid_classes:
571 if match_class in class_name:
572 return
573 for child in list(el):
574 autolink(child, link_regexes=link_regexes,
575 avoid_elements=avoid_elements,
576 avoid_hosts=avoid_hosts,
577 avoid_classes=avoid_classes)
578 if child.tail:
579 text, tail_children = _link_text(
580 child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
581 if tail_children:
582 child.tail = text
583 index = el.index(child)
584 el[index+1:index+1] = tail_children
585 if el.text:
586 text, pre_children = _link_text(
587 el.text, link_regexes, avoid_hosts, factory=el.makeelement)
588 if pre_children:
589 el.text = text
590 el[:0] = pre_children
591
592 -def _link_text(text, link_regexes, avoid_hosts, factory):
593 leading_text = ''
594 links = []
595 last_pos = 0
596 while 1:
597 best_match, best_pos = None, None
598 for regex in link_regexes:
599 regex_pos = last_pos
600 while 1:
601 match = regex.search(text, pos=regex_pos)
602 if match is None:
603 break
604 host = match.group('host')
605 for host_regex in avoid_hosts:
606 if host_regex.search(host):
607 regex_pos = match.end()
608 break
609 else:
610 break
611 if match is None:
612 continue
613 if best_pos is None or match.start() < best_pos:
614 best_match = match
615 best_pos = match.start()
616 if best_match is None:
617
618 if links:
619 assert not links[-1].tail
620 links[-1].tail = text
621 else:
622 assert not leading_text
623 leading_text = text
624 break
625 link = best_match.group(0)
626 end = best_match.end()
627 if link.endswith('.') or link.endswith(','):
628
629 end -= 1
630 link = link[:-1]
631 prev_text = text[:best_match.start()]
632 if links:
633 assert not links[-1].tail
634 links[-1].tail = prev_text
635 else:
636 assert not leading_text
637 leading_text = prev_text
638 anchor = factory('a')
639 anchor.set('href', link)
640 body = best_match.group('body')
641 if not body:
642 body = link
643 if body.endswith('.') or body.endswith(','):
644 body = body[:-1]
645 anchor.text = body
646 links.append(anchor)
647 text = text[end:]
648 return leading_text, links
649
658
659 autolink_html.__doc__ = autolink.__doc__
660
661
662
663
664
665 _avoid_word_break_elements = ['pre', 'textarea', 'code']
666 _avoid_word_break_classes = ['nobreak']
667
672 """
673 Breaks any long words found in the body of the text (not attributes).
674
675 Doesn't affect any of the tags in avoid_elements, by default
676 ``<textarea>``, ``<pre>`` and ``<code>``.
677
678 Breaks words by inserting ``&#8203;`` (U+200B), the Unicode
679 Zero Width Space character. This generally takes up no space
680 in rendering, but does copy as a space, and in monospace contexts
681 usually takes up space.
682
683 See http://www.cs.tut.fi/~jkorpela/html/nobr.html for a discussion
684 """
685
686
687 if el.tag in _avoid_word_break_elements:
688 return
689 class_name = el.get('class')
690 if class_name:
691 dont_break = False
692 class_name = class_name.split()
693 for avoid in avoid_classes:
694 if avoid in class_name:
695 dont_break = True
696 break
697 if dont_break:
698 return
699 if el.text:
700 el.text = _break_text(el.text, max_width, break_character)
701 for child in el:
702 word_break(child, max_width=max_width,
703 avoid_elements=avoid_elements,
704 avoid_classes=avoid_classes,
705 break_character=break_character)
706 if child.tail:
707 child.tail = _break_text(child.tail, max_width, break_character)
708
714
def _break_text(text, max_width, break_character):
    """Insert ``break_character`` into every over-long word of ``text``.

    A word is a maximal run of non-whitespace characters.  Each word longer
    than ``max_width`` is rewritten by ``_insert_break``; shorter words and
    all whitespace are returned unchanged.
    """
    def break_word(match):
        word = match.group(0)
        if len(word) > max_width:
            return _insert_break(word, max_width, break_character)
        return word

    # Rewrite each word where it stands.  Calling text.replace(word, ...)
    # once per long word also rewrote that word wherever it occurred inside
    # other, longer words, and rescanned the whole text for every long word.
    # re.U keeps \S aligned with the whitespace that str.split() recognises.
    return re.sub(r'\S+', break_word, text, flags=re.U)
722
723 _break_prefer_re = re.compile(r'[^a-z]', re.I)
724
726 orig_word = word
727 result = ''
728 while len(word) > width:
729 start = word[:width]
730 breaks = list(_break_prefer_re.finditer(start))
731 if breaks:
732 last_break = breaks[-1]
733
734 if last_break.end() > width-10:
735
736
737 start = word[:last_break.end()]
738 result += start + break_character
739 word = word[len(start):]
740 result += word
741 return result
742