2014-05-14 13 views

risposta

1

Ecco un esempio:

/**
 * Parses a hard-coded PDF file with Tika's {@code AutoDetectParser} and
 * collects its title and body text.
 *
 * <p>Parsing is best-effort: if the file cannot be opened or parsed, the
 * stack trace is printed and the slots that were not filled remain
 * {@code null}.
 *
 * @return a three-element array: index 0 is {@code "Title: "} followed by the
 *         title metadata value, index 1 is {@code "Body: "} followed by the
 *         extracted text; index 2 is never assigned here and stays {@code null}
 */
public String[] tika_autoParser() {
    String[] result = new String[3];
    // try-with-resources closes the stream on success and on every failure path.
    try (InputStream input = new FileInputStream(new File("/Users/nazanin/Books/Web crawler.pdf"))) {
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        parser.parse(input, textHandler, metadata, context);
        // TITLE is a static constant, so reference it through the class.
        result[0] = "Title: " + metadata.get(Metadata.TITLE);
        result[1] = "Body: " + textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // FileNotFoundException is an IOException, so it is reported here as well.
        e.printStackTrace();
    }

    return result;
}
4

La classe BodyContentHandler non utilizza il codice Boilerpipe, quindi dovrete utilizzare in modo esplicito il BoilerpipeContentHandler. Il seguente codice ha funzionato per me:

/**
 * Parses {@code test.html} with Tika's {@code AutoDetectParser}, routing the
 * SAX events through a {@code BoilerpipeContentHandler} that wraps the
 * {@code BodyContentHandler} from which the body text is read.
 *
 * <p>Parsing is best-effort: if the file cannot be opened or parsed, the
 * stack trace is printed and the slots that were not filled remain
 * {@code null}.
 *
 * @return a three-element array: index 0 is {@code "Title: "} followed by the
 *         title metadata value, index 1 is {@code "Body: "} followed by the
 *         text collected by the wrapped handler; index 2 is never assigned
 *         here and stays {@code null}
 */
public String[] tika_autoParser() {
    String[] result = new String[3];
    // try-with-resources closes the stream on success and on every failure path.
    try (InputStream input = new FileInputStream(new File("test.html"))) {
        ContentHandler textHandler = new BodyContentHandler();
        Metadata metadata = new Metadata();
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();
        // The parser writes to the Boilerpipe wrapper; the text is read back from textHandler.
        parser.parse(input, new BoilerpipeContentHandler(textHandler), metadata, context);
        // TITLE is a static constant, so reference it through the class.
        result[0] = "Title: " + metadata.get(Metadata.TITLE);
        result[1] = "Body: " + textHandler.toString();
    } catch (IOException | SAXException | TikaException e) {
        // FileNotFoundException is an IOException, so it is reported here as well.
        e.printStackTrace();
    }

    return result;
}