java - extract images from pdf using pdfbox -

July 15, 2012

I'm trying to extract images from a PDF using PDFBox. An example PDF is here.

But I'm getting only blank images.

The code I'm trying:

public static void main(string[] args) {    pdfimageextract obj = new pdfimageextract();     try {         obj.read_pdf();     } catch (ioexception ex) {         system.out.println("" + ex);     }  }   void read_pdf() throws ioexception {     pddocument document = null;      try {         document = pddocument.load("c:\\users\\pradyut\\documents\\mcs-034.pdf");     } catch (ioexception ex) {         system.out.println("" + ex);     }     list pages = document.getdocumentcatalog().getallpages();     iterator iter = pages.iterator();      int =1;     string name = null;      while (iter.hasnext()) {         pdpage page = (pdpage) iter.next();         pdresources resources = page.getresources();         map pageimages = resources.getimages();         if (pageimages != null) {              iterator imageiter = pageimages.keyset().iterator();             while (imageiter.hasnext()) {                 string key = (string) imageiter.next();                 pdxobjectimage image = (pdxobjectimage) pageimages.get(key);                 image.write2file("c:\\users\\pradyut\\documents\\image" + i);                 ++;             }         }     }  }

thanks

The getimagesfrompdf Java class below extracts all images in the 04-request-headers.pdf file and saves them as files in the destination folder pdfcopy.

import java.io.file; import java.util.iterator; import java.util.list; import java.util.map;  import org.apache.pdfbox.pdmodel.pddocument; import org.apache.pdfbox.pdmodel.pdpage; import org.apache.pdfbox.pdmodel.pdresources; import org.apache.pdfbox.pdmodel.graphics.xobject.pdxobjectimage;  @suppresswarnings({ "unchecked", "rawtypes", "deprecation" }) public class getimagesfrompdf {     public static void main(string[] args) {         try {             string sourcedir = "c:/pdfcopy/04-request-headers.pdf";// paste pdf files in pdfcopy folder read             string destinationdir = "c:/pdfcopy/";             file oldfile = new file(sourcedir);             if (oldfile.exists()) {             pddocument document = pddocument.load(sourcedir);              list<pdpage> list = document.getdocumentcatalog().getallpages();              string filename = oldfile.getname().replace(".pdf", "_cover");             int totalimages = 1;             (pdpage page : list) {                 pdresources pdresources = page.getresources();                  map pageimages = pdresources.getimages();                 if (pageimages != null) {                      iterator imageiter = pageimages.keyset().iterator();                     while (imageiter.hasnext()) {                         string key = (string) imageiter.next();                         pdxobjectimage pdxobjectimage = (pdxobjectimage) pageimages.get(key);                         pdxobjectimage.write2file(destinationdir + filename+ "_" + totalimages);                         totalimages++;                     }                 }             }         } else {             system.err.println("file not exists");         }     } catch (exception e) {         e.printstacktrace();     } }

}

Search This Blog

RT

java - extract images from pdf using pdfbox -

Comments

Post a Comment

Popular posts from this blog

python - Selenium remoteWebDriver (& SauceLabs) Firefox moseMoveTo action exception -

html - How to custom Bootstrap grid height? -

javascript - pass values from mssql to views in node -