// ~roignac/+junk/prokopovich_parser

import org.apache.commons.lang.StringEscapeUtils;
import org.htmlcleaner.*;

class ProkopovichRegexPattern {
    
    public static void main(String[] args){
        String description = "<div class="field field-name-field-what-sell field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/currency/%D0%B5%D0%B2%D1%80%D0%BE" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Евро</a></div></div></div><div class="field field-name-post-date field-type-ds field-label-hidden"><div class="field-items"><div class="field-item even">16/09/2011 - 15:24</div></div></div><div class="field field-name-field-you field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/%D1%81%D1%83%D0%B1%D1%8A%D0%B5%D0%BA%D1%82%D1%8B-%D1%85%D0%BE%D0%B7%D1%8F%D0%B9%D1%81%D1%82%D0%B2%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D1%8F/%D1%84%D0%B8%D0%B7-%D0%BB%D0%B8%D1%86%D0%BE" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Физ. лицо</a></div></div></div><div class="field field-name-field-address field-type-taxonomy-term-reference field-label-hidden"><div class="field-items"><div class="field-item even"><a href="/geo/%D0%BC%D0%B8%D0%BD%D1%81%D0%BA" typeof="skos:Concept" property="rdfs:label skos:prefLabel">Минск</a></div></div></div><div class="field field-name-field-amount field-type-number-integer field-label-hidden"><div class="field-items"><div class="field-item even">230</div></div></div><div class="field field-name-field-price field-type-number-integer field-label-hidden"><div class="field-items"><div class="field-item even">11 900 Br.</div></div></div>";
        
        // Unescape HTML
        String unescapedDescription =  StringEscapeUtils.unescapeHtml(description);
        
        // Parse HTML
        HtmlCleaner cleaner = new HtmlCleaner();
        TagNode root = cleaner.clean(unescapedDescription);
        
        // Get text for div with class 'field-name-field-what-sell'
        System.out.println (getTagContentsByClass(root, "field-name-field-what-sell"));
        System.out.println (getTagContentsByClass(root, "field-name-post-date"));
        System.out.println (getTagContentsByClass(root, "field-name-field-you"));
        System.out.println (getTagContentsByClass(root, "field-name-field-address"));
        System.out.println (getTagContentsByClass(root, "field-name-field-amount"));
        System.out.println (getTagContentsByClass(root, "field-name-field-price"));
    }
    
    public static String getTagContentsByClass(TagNode root, String className){
        String tagContents = null;
        TagNode divElements[] = root.getElementsByName("div", true);
        for (int i=0; divElements != null && i < divElements.length; i++){
            String classType = divElements[i].getAttributeByName("class");
            if (classType != null && classType.contains(className)){
                tagContents = divElements[i].getText().toString();
                break;
            }
        }
        return tagContents;
    }
}