001 // Copyright (c) 2011, Mike Samuel
002 // All rights reserved.
003 //
004 // Redistribution and use in source and binary forms, with or without
005 // modification, are permitted provided that the following conditions
006 // are met:
007 //
008 // Redistributions of source code must retain the above copyright
009 // notice, this list of conditions and the following disclaimer.
010 // Redistributions in binary form must reproduce the above copyright
011 // notice, this list of conditions and the following disclaimer in the
012 // documentation and/or other materials provided with the distribution.
013 // Neither the name of the OWASP nor the names of its contributors may
014 // be used to endorse or promote products derived from this software
015 // without specific prior written permission.
016 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019 // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020 // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021 // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022 // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023 // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026 // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027 // POSSIBILITY OF SUCH DAMAGE.
028
029 package org.owasp.html.examples;
030
031 import java.io.IOException;
032 import java.io.InputStreamReader;
033 import java.util.regex.Pattern;
034
035 import org.owasp.html.Handler;
036 import org.owasp.html.HtmlPolicyBuilder;
037 import org.owasp.html.HtmlSanitizer;
038 import org.owasp.html.HtmlStreamEventReceiver;
039 import org.owasp.html.HtmlStreamRenderer;
040
041 import com.google.common.base.Charsets;
042 import com.google.common.base.Function;
043 import com.google.common.base.Throwables;
044 import com.google.common.io.CharStreams;
045
046 /**
047 * Based on the
048 * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy Slashdot example</a>.
049 * <blockquote>
050 * Slashdot (http://www.slashdot.org/) is a techie news site that allows users
051 * to respond anonymously to news posts with very limited HTML markup. Now
052 * Slashdot is not only one of the coolest sites around, it's also one that's
053 * been subject to many different successful attacks. Even more unfortunate is
054 * the fact that most of the attacks led users to the infamous goatse.cx picture
055 * (please don't go look it up). The rules for Slashdot are fairly strict: users
056 * can only submit the following HTML tags and no CSS: {@code <b>}, {@code <u>},
057 * {@code <i>}, {@code <a>}, {@code <blockquote>}.
058 * <br>
059 * Accordingly, we've built a policy file that allows fairly similar
060 * functionality. All text-formatting tags that operate directly on the font,
061 * color or emphasis have been allowed.
062 * </blockquote>
063 */
064 public class SlashdotPolicyExample {
065
066 /** A policy definition that matches the minimal HTML that Slashdot allows. */
067 public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy>
068 POLICY_DEFINITION = new HtmlPolicyBuilder()
069 .allowStandardUrlProtocols()
070 // Allow title="..." on any element.
071 .allowAttributes("title").globally()
072 // Allow href="..." on <a> elements.
073 .allowAttributes("href").onElements("a")
074 // Defeat link spammers.
075 .requireRelNofollowOnLinks()
076 // Allow lang= with an alphabetic value on any element.
077 .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
078 .globally()
079 // The align attribute on <p> elements can have any value below.
080 .allowAttributes("align")
081 .matching(true, "center", "left", "right", "justify", "char")
082 .onElements("p")
083 // These elements are allowed.
084 .allowElements(
085 "a", "p", "div", "i", "b", "em", "blockquote", "tt", "strong",
086 "br", "ul", "ol", "li")
087 // Custom slashdot tags.
088 // These could be rewritten in the sanitizer using an ElementPolicy.
089 .allowElements("quote", "ecode")
090 .toFactory();
091
092 public static void main(String[] args) throws IOException {
093 if (args.length != 0) {
094 System.err.println("Reads from STDIN and writes to STDOUT");
095 System.exit(-1);
096 }
097 System.err.println("[Reading from STDIN]");
098 // Fetch the HTML to sanitize.
099 String html = CharStreams.toString(
100 new InputStreamReader(System.in, Charsets.UTF_8));
101 // Set up an output channel to receive the sanitized HTML.
102 HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
103 System.out,
104 // Receives notifications on a failure to write to the output.
105 new Handler<IOException>() {
106 public void handle(IOException ex) {
107 Throwables.propagate(ex); // System.out suppresses IOExceptions
108 }
109 },
110 // Our HTML parser is very lenient, but this receives notifications on
111 // truly bizarre inputs.
112 new Handler<String>() {
113 public void handle(String x) {
114 throw new AssertionError(x);
115 }
116 });
117 // Use the policy defined above to sanitize the HTML.
118 HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
119 }
120 }