001    // Copyright (c) 2011, Mike Samuel
002    // All rights reserved.
003    //
004    // Redistribution and use in source and binary forms, with or without
005    // modification, are permitted provided that the following conditions
006    // are met:
007    //
008    // Redistributions of source code must retain the above copyright
009    // notice, this list of conditions and the following disclaimer.
010    // Redistributions in binary form must reproduce the above copyright
011    // notice, this list of conditions and the following disclaimer in the
012    // documentation and/or other materials provided with the distribution.
013    // Neither the name of the OWASP nor the names of its contributors may
014    // be used to endorse or promote products derived from this software
015    // without specific prior written permission.
016    // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
017    // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
018    // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
019    // FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
020    // COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
021    // INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
022    // BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
023    // LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
024    // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
025    // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
026    // ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
027    // POSSIBILITY OF SUCH DAMAGE.
028    
029    package org.owasp.html.examples;
030    
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.PrintStream;
import java.util.regex.Pattern;

import org.owasp.html.Handler;
import org.owasp.html.HtmlPolicyBuilder;
import org.owasp.html.HtmlSanitizer;
import org.owasp.html.HtmlStreamEventReceiver;
import org.owasp.html.HtmlStreamRenderer;

import com.google.common.base.Charsets;
import com.google.common.base.Function;
import com.google.common.base.Throwables;
import com.google.common.io.CharStreams;
045    
046    /**
047     * Based on the
048     * <a href="http://www.owasp.org/index.php/Category:OWASP_AntiSamy_Project#Stage_2_-_Choosing_a_base_policy_file">AntiSamy Slashdot example</a>.
049     * <blockquote>
050     * Slashdot (http://www.slashdot.org/) is a techie news site that allows users
051     * to respond anonymously to news posts with very limited HTML markup. Now
052     * Slashdot is not only one of the coolest sites around, it's also one that's
053     * been subject to many different successful attacks. Even more unfortunate is
054     * the fact that most of the attacks led users to the infamous goatse.cx picture
055     * (please don't go look it up). The rules for Slashdot are fairly strict: users
056     * can only submit the following HTML tags and no CSS: {@code <b>}, {@code <u>},
057     * {@code <i>}, {@code <a>}, {@code <blockquote>}.
058     * <br>
059     * Accordingly, we've built a policy file that allows fairly similar
060     * functionality. All text-formatting tags that operate directly on the font,
061     * color or emphasis have been allowed.
062     * </blockquote>
063     */
064    public class SlashdotPolicyExample {
065    
066      /** A policy definition that matches the minimal HTML that Slashdot allows. */
067      public static final Function<HtmlStreamEventReceiver, HtmlSanitizer.Policy>
068          POLICY_DEFINITION = new HtmlPolicyBuilder()
069              .allowStandardUrlProtocols()
070              // Allow title="..." on any element.
071              .allowAttributes("title").globally()
072              // Allow href="..." on <a> elements.
073              .allowAttributes("href").onElements("a")
074              // Defeat link spammers.
075              .requireRelNofollowOnLinks()
076              // Allow lang= with an alphabetic value on any element.
077              .allowAttributes("lang").matching(Pattern.compile("[a-zA-Z]{2,20}"))
078                  .globally()
079              // The align attribute on <p> elements can have any value below.
080              .allowAttributes("align")
081                  .matching(true, "center", "left", "right", "justify", "char")
082                  .onElements("p")
083              // These elements are allowed.
084              .allowElements(
085                  "a", "p", "div", "i", "b", "em", "blockquote", "tt", "strong",
086                  "br", "ul", "ol", "li")
087              // Custom slashdot tags.
088              // These could be rewritten in the sanitizer using an ElementPolicy.
089              .allowElements("quote", "ecode")
090              .toFactory();
091    
092      public static void main(String[] args) throws IOException {
093        if (args.length != 0) {
094          System.err.println("Reads from STDIN and writes to STDOUT");
095          System.exit(-1);
096        }
097        System.err.println("[Reading from STDIN]");
098        // Fetch the HTML to sanitize.
099        String html = CharStreams.toString(
100            new InputStreamReader(System.in, Charsets.UTF_8));
101        // Set up an output channel to receive the sanitized HTML.
102        HtmlStreamRenderer renderer = HtmlStreamRenderer.create(
103            System.out,
104            // Receives notifications on a failure to write to the output.
105            new Handler<IOException>() {
106              public void handle(IOException ex) {
107                Throwables.propagate(ex);  // System.out suppresses IOExceptions
108              }
109            },
110            // Our HTML parser is very lenient, but this receives notifications on
111            // truly bizarre inputs.
112            new Handler<String>() {
113              public void handle(String x) {
114                throw new AssertionError(x);
115              }
116            });
117        // Use the policy defined above to sanitize the HTML.
118        HtmlSanitizer.sanitize(html, POLICY_DEFINITION.apply(renderer));
119      }
120    }