View Javadoc

1   /*
2    * Copyright 2012-2013 smartics, Kronseder & Reiner GmbH
3    *
4    * Licensed under the Apache License, Version 2.0 (the "License");
5    * you may not use this file except in compliance with the License.
6    * You may obtain a copy of the License at
7    *
8    *     http://www.apache.org/licenses/LICENSE-2.0
9    *
10   * Unless required by applicable law or agreed to in writing, software
11   * distributed under the License is distributed on an "AS IS" BASIS,
12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13   * See the License for the specific language governing permissions and
14   * limitations under the License.
15   */
16  package de.smartics.properties.utils;
17  
18  import java.io.ByteArrayOutputStream;
19  import java.io.IOException;
20  
21  import org.apache.commons.lang.StringUtils;
22  import org.htmlcleaner.CleanerProperties;
23  import org.htmlcleaner.DefaultTagProvider;
24  import org.htmlcleaner.HtmlCleaner;
25  import org.htmlcleaner.ITagInfoProvider;
26  import org.htmlcleaner.JDomSerializer;
27  import org.htmlcleaner.SimpleXmlSerializer;
28  import org.htmlcleaner.TagNode;
29  import org.jdom.Document;
30  
31  import de.smartics.util.lang.Arguments;
32  
33  /**
34   * Utilities to deal with HTML.
35   */
36  public final class HtmlUtils
37  {
38    // ********************************* Fields *********************************
39  
40    // --- constants ------------------------------------------------------------
41  
42    // --- members --------------------------------------------------------------
43  
44    /**
45     * The encoding used write.
46     */
47    private final String outputEncoding;
48  
49    /**
50     * The cleaner instance to use.
51     */
52    private final HtmlCleaner cleaner;
53  
54    // ****************************** Initializer *******************************
55  
56    // ****************************** Constructors ******************************
57  
58    /**
59     * Default constructor.
60     *
61     * @param outputEncoding the encoding used write.
62     */
63    public HtmlUtils(final String outputEncoding)
64    {
65      Arguments.checkNotBlank("outputEncoding", outputEncoding);
66  
67      this.outputEncoding = outputEncoding;
68  
69      cleaner = new HtmlCleaner();
70    }
71  
72    // ****************************** Inner Classes *****************************
73  
74    // ********************************* Methods ********************************
75  
76    // --- init -----------------------------------------------------------------
77  
78    // --- get&set --------------------------------------------------------------
79  
80    // --- business -------------------------------------------------------------
81  
82    /**
83     * Cleans up the HTML fragment.
84     *
85     * @param htmlFragment the dirty fragment.
86     * @return the cleaned up fragment.
87     */
88    public String clean(final String htmlFragment)
89    {
90      if (StringUtils.isBlank(htmlFragment))
91      {
92        return htmlFragment;
93      }
94      final TagNode tagNode = cleaner.clean(htmlFragment);
95      final TagNode body = tagNode.findElementByName("body", false);
96  
97      final ByteArrayOutputStream out = new ByteArrayOutputStream(1024);
98      final CleanerProperties cleanerProps = createCleanerProperties();
99      try
100     {
101       final SimpleXmlSerializer serializer =
102           new SimpleXmlSerializer(cleanerProps);
103       serializer.writeToStream(body, out, outputEncoding, true);
104       final String content = out.toString(outputEncoding);
105       return content;
106     }
107     catch (final IOException e)
108     {
109       throw new IllegalStateException(
110           "Streaming error with in memory stream or encoding.", e);
111     }
112   }
113 
114   /**
115    * Cleans up the HTML fragment.
116    *
117    * @param htmlFragment the dirty fragment.
118    * @return the cleaned up fragment.
119    */
120   public Document cleanJDom(final String htmlFragment)
121   {
122     if (StringUtils.isBlank(htmlFragment))
123     {
124       return null;
125     }
126 
127     final TagNode tagNode = cleaner.clean(htmlFragment);
128     final TagNode body = tagNode.findElementByName("body", false);
129     final CleanerProperties cleanerProps = createCleanerProperties();
130     final Document document =
131         new JDomSerializer(cleanerProps, true).createJDom(body);
132     return document;
133   }
134 
135   private static CleanerProperties createCleanerProperties()
136   {
137     final CleanerProperties properties = new CleanerProperties()
138     {
139 
140       // CHECKSTYLE:OFF analogous to the implementation that gets fixed here.
141       ITagInfoProvider tagInfoProvider = new DefaultTagProvider(); // NOPMD
142 
143       // CHECKSTYLE:ON
144 
145       @Override
146       public ITagInfoProvider getTagInfoProvider()
147       {
148         return tagInfoProvider;
149       }
150     };
151 
152     properties.setOmitHtmlEnvelope(true);
153     properties.setOmitDoctypeDeclaration(true);
154     properties.setOmitXmlDeclaration(true);
155     properties.setUseEmptyElementTags(true);
156 
157     return properties;
158   }
159 
160   // --- object basics --------------------------------------------------------
161 
162 }