1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package de.smartics.properties.utils;
17
18 import java.io.ByteArrayOutputStream;
19 import java.io.IOException;
20
21 import org.apache.commons.lang.StringUtils;
22 import org.htmlcleaner.CleanerProperties;
23 import org.htmlcleaner.DefaultTagProvider;
24 import org.htmlcleaner.HtmlCleaner;
25 import org.htmlcleaner.ITagInfoProvider;
26 import org.htmlcleaner.JDomSerializer;
27 import org.htmlcleaner.SimpleXmlSerializer;
28 import org.htmlcleaner.TagNode;
29 import org.jdom.Document;
30
31 import de.smartics.util.lang.Arguments;
32
33
34
35
36 public final class HtmlUtils
37 {
38
39
40
41
42
43
44
45
46
47 private final String outputEncoding;
48
49
50
51
52 private final HtmlCleaner cleaner;
53
54
55
56
57
58
59
60
61
62
63 public HtmlUtils(final String outputEncoding)
64 {
65 Arguments.checkNotBlank("outputEncoding", outputEncoding);
66
67 this.outputEncoding = outputEncoding;
68
69 cleaner = new HtmlCleaner();
70 }
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88 public String clean(final String htmlFragment)
89 {
90 if (StringUtils.isBlank(htmlFragment))
91 {
92 return htmlFragment;
93 }
94 final TagNode tagNode = cleaner.clean(htmlFragment);
95 final TagNode body = tagNode.findElementByName("body", false);
96
97 final ByteArrayOutputStream out = new ByteArrayOutputStream(1024);
98 final CleanerProperties cleanerProps = createCleanerProperties();
99 try
100 {
101 final SimpleXmlSerializer serializer =
102 new SimpleXmlSerializer(cleanerProps);
103 serializer.writeToStream(body, out, outputEncoding, true);
104 final String content = out.toString(outputEncoding);
105 return content;
106 }
107 catch (final IOException e)
108 {
109 throw new IllegalStateException(
110 "Streaming error with in memory stream or encoding.", e);
111 }
112 }
113
114
115
116
117
118
119
120 public Document cleanJDom(final String htmlFragment)
121 {
122 if (StringUtils.isBlank(htmlFragment))
123 {
124 return null;
125 }
126
127 final TagNode tagNode = cleaner.clean(htmlFragment);
128 final TagNode body = tagNode.findElementByName("body", false);
129 final CleanerProperties cleanerProps = createCleanerProperties();
130 final Document document =
131 new JDomSerializer(cleanerProps, true).createJDom(body);
132 return document;
133 }
134
135 private static CleanerProperties createCleanerProperties()
136 {
137 final CleanerProperties properties = new CleanerProperties()
138 {
139
140
141 ITagInfoProvider tagInfoProvider = new DefaultTagProvider();
142
143
144
145 @Override
146 public ITagInfoProvider getTagInfoProvider()
147 {
148 return tagInfoProvider;
149 }
150 };
151
152 properties.setOmitHtmlEnvelope(true);
153 properties.setOmitDoctypeDeclaration(true);
154 properties.setOmitXmlDeclaration(true);
155 properties.setUseEmptyElementTags(true);
156
157 return properties;
158 }
159
160
161
162 }