// Example: reading the text of <div> elements selected by class name, using HtmlCleaner.
import org.apache.commons.lang.StringEscapeUtils;
import org.htmlcleaner.*;
class ProkopovichRegexPattern {
public static void main(String[] args){
String description = "<div class="field field-name-field-what-sell field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/currency/%D0%B5%D0%B2%D1%80%D0%BE" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Евро</a></div></div></div><div class="field field-name-post-date field-type-ds field-label-hidden"><div class="field-items"><div class="field-item even">16/09/2011 - 15:24</div></div></div><div class="field field-name-field-you field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/%D1%81%D1%83%D0%B1%D1%8A%D0%B5%D0%BA%D1%82%D1%8B-%D1%85%D0%BE%D0%B7%D1%8F%D0%B9%D1%81%D1%82%D0%B2%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D1%8F/%D1%84%D0%B8%D0%B7-%D0%BB%D0%B8%D1%86%D0%BE" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Физ. лицо</a></div></div></div><div class="field field-name-field-address field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/geo/%D0%BC%D0%B8%D0%BD%D1%81%D0%BA" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Минск</a></div></div></div><div class="field field-name-field-amount field-type-number-integer field-label-hidden"><div class="field-items"><div class="field-item even">230</div></div></div><div class="field field-name-field-price field-type-number-integer field-label-hidden"><div class="field-items"><div class="field-item even">11 900 Br.</div></div></div>";
// Unescape HTML
String unescapedDescription = StringEscapeUtils.unescapeHtml(description);
// Parse HTML
HtmlCleaner cleaner = new HtmlCleaner();
TagNode root = cleaner.clean(unescapedDescription);
// Get text for div with class 'field-name-field-what-sell'
System.out.println (getTagContentsByClass(root, "field-name-field-what-sell"));
System.out.println (getTagContentsByClass(root, "field-name-post-date"));
System.out.println (getTagContentsByClass(root, "field-name-field-you"));
System.out.println (getTagContentsByClass(root, "field-name-field-address"));
System.out.println (getTagContentsByClass(root, "field-name-field-amount"));
System.out.println (getTagContentsByClass(root, "field-name-field-price"));
}
public static String getTagContentsByClass(TagNode root, String className){
String tagContents = null;
TagNode divElements[] = root.getElementsByName("div", true);
for (int i=0; divElements != null && i < divElements.length; i++){
String classType = divElements[i].getAttributeByName("class");
if (classType != null && classType.contains(className)){
tagContents = divElements[i].getText().toString();
break;
}
}
return tagContents;
}
}
// End of example.