import java.io.File;
import java.io.FileInputStream;
import java.io.FilenameFilter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.swing.text.Document;
import javax.swing.text.rtf. RTFEditorKit;
import org.apache.poi.hwpf. HWPFDocument;
import org.apache.poi.hwpf.extractor. WordExtractor;
import org.apache.poi.hwpf.usermodel. HeaderStories;
import org.apache.poi.poifs. filesystem.POIFSFileSystem;
import org.apache.poi.xwpf.extractor. XWPFWordExtractor;
import org.apache.poi.xwpf.usermodel. XWPFDocument;
import com.itextpdf.text.pdf. PdfReader;
import com.itextpdf.text.pdf.parser. PdfTextExtractor;
public class DocumentParser1 {
// Shared logger for this class. The name carries no leading space so that
// logging configuration keyed on "DocumentParser1" applies to it.
private static final Logger _log = Logger.getLogger("DocumentParser1");
/**
 * Scans a folder for resume files (.docx, .doc, .pdf, .rtf), builds one
 * "expected" ResumeInfo (from the file name) and one "actual" ResumeInfo
 * (from the file content) per resume, and prints the results.
 *
 * @param args optional; when at least one argument is given, args[0] is the
 *             folder to scan, otherwise the built-in default folder is used
 */
public static void main(String[] args) {
    _log.info("##### Entered into main method ");
    // The first command-line argument overrides the historical hard-coded folder.
    String strResumeDrive = (args != null && args.length > 0)
            ? args[0]
            : "C:\\google drive\\mroads_resumes";
    List<ResumeInfo> expectedValues = new ArrayList<ResumeInfo>();
    List<ResumeInfo> actualValues = new ArrayList<ResumeInfo>();
    File[] resumes = null;
    try {
        File resumeFolder = new File(strResumeDrive);
        // Accept only the formats getResumeContent knows how to read (case-insensitive).
        FilenameFilter textFilter = new FilenameFilter() {
            @Override
            public boolean accept(File dir, String name) {
                String lowerName = name.toLowerCase();
                return lowerName.endsWith(".docx") || lowerName.endsWith(".doc")
                        || lowerName.endsWith(".pdf") || lowerName.endsWith(".rtf");
            }
        };
        resumes = resumeFolder.listFiles(textFilter);
        if (resumes == null) {
            // listFiles returns null when the path is not a directory or cannot be read.
            _log.warning("##### Resume folder is missing or unreadable: " + strResumeDrive);
            resumes = new File[0];
        }
        for (File resume : resumes) {
            _log.info("##### resume "+resume);
            ResumeInfo expected = new ResumeInfo();
            ResumeInfo actual = new ResumeInfo();
            String resumePath = resume.getAbsolutePath();
            _log.info("##### resumePath "+resumePath);
            System.out.println("path name of resume is :" + resumePath);
            expected.setResumePath(resumePath);
            actual.setResumePath(resumePath);
            getExpectedValues(expected);
            expectedValues.add(expected);
            getActualValues(actual);
            actualValues.add(actual);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    // resumes is still null here if the listing itself threw; report 0 instead of failing.
    int totalResumes = (resumes == null) ? 0 : resumes.length;
    _log.info("##### Total number of resumes "+totalResumes);
    _log.info("##### Total number of resumes parsed "+actualValues.size());
    System.out.println("Total number of resumes : "+totalResumes);
    System.out.println("Total number of resumes parsed: "+actualValues.size());
    printResults(expectedValues, actualValues);
    _log.info("##### End of main method ###### ");
}
/**
 * Fills the given ResumeInfo from the resume itself: loads the content, then
 * parses the e-mail, the contact number and the name, in that order.
 * An exception raised by any of those steps is logged and its stack trace is
 * printed; it is not propagated, and the remaining steps are skipped.
 *
 * @param res resume record to populate
 */
public static void getActualValues(ResumeInfo res){
    _log.info("##### entered into getActualValues method #####");
    try {
        getResumeContent(res);
        parseEmail(res);
        parseContact(res);
        parseName(res);
    } catch (Exception parseFailure) {
        _log.info("##### Exception while getting ActualValues");
        parseFailure.printStackTrace();
    }
    _log.info("##### Exit from getActualValues method #####");
}
// Shape of a resume file name: "<first>.<last>.<digits>" (an extension may follow).
private static final Pattern EXPECTED_NAME_PATTERN =
        Pattern.compile("(\\w{1,25})\\.(\\w{1,25})\\.(\\d{1,10})");

/**
 * Derives the expected values from the resume's file name, which is assumed
 * to look like "First.Last.1234567890.ext".
 * On a match, sets first name, last name, phone number and file name (the
 * matched "First.Last.digits" text) on the given object. When the path is
 * null or the name does not match, nothing is set and the fact is logged.
 * Exceptions are logged and printed, not propagated.
 *
 * @param res resume record whose resume path is read and whose expected
 *            fields are written
 */
public static void getExpectedValues(ResumeInfo res) {
    _log.info("##### entered into getExpectedValues method #######");
    try {
        String resumePath = res.getResumePath();
        if (resumePath == null) {
            _log.info("##### No resume path set, expected values not extracted");
        } else {
            // Match against the last path segment only, so dots in folder
            // names cannot be mistaken for the name separators.
            Matcher matcher = EXPECTED_NAME_PATTERN.matcher(new File(resumePath).getName());
            if (matcher.find()) {
                _log.info("##### pattern is matched");
                String filename = matcher.group(0);
                _log.info("##### Match fileName is :"+filename);
                System.out.println("matched filename :" + filename);
                // set values to ResumeInfo object
                res.setFirstName(matcher.group(1));
                res.setLastName(matcher.group(2));
                res.setPhoneNumber(matcher.group(3));
                res.setFileName(filename);
            } else {
                _log.info("##### File name does not follow First.Last.Phone: "+resumePath);
            }
        }
    } catch (Exception e) {
        _log.info("##### Exception while set values to object");
        e.printStackTrace();
    }
    _log.info("##### Exit from getExpectedValues method #######");
}
// Getting resume content from resume path
public static void getResumeContent(ResumeInfo res) {
_log.info("##### entered into getResumeContent method ######");
String resumePath = res.getResumePath();
String[] lines = null;
String fileEx = fileExtension(resumePath);
_log.info("##### File extention is "+fileEx);
// if condition for match to docx file
if (fileEx.equals("docx")) {
_log.info("##### docX file uploaded "+resumePath);
try{
FileInputStream fs = new FileInputStream(resumePath);
XWPFDocument doc1 = new XWPFDocument(fs);
lines = getParagraphsFromResume(doc1);
//_log.info("##### lines are :"+lines);
}
catch(Exception e){
_log.info("##### Exception while creating XWPFDocument object using FileInputStream "+e.getMessage());
e.printStackTrace();
}
}
// else if condition for match to doc file
else if (fileEx.equals("doc"))
{
System.out.println("doc file uploaded");
_log.info("##### doc file uploaded "+resumePath);
try {
POIFSFileSystem fs = new POIFSFileSystem(new FileInputStream(
resumePath));
System.out.println("test1");
HWPFDocument doc = new HWPFDocument(fs);
System.out.println("test1");
lines = getParagraphsFromResume(doc);
System.out.println("test1");
_log.info("##### lines are :"+lines);
} catch (Exception e) {
// TODO Auto-generated catch block
_log.info("##### Exception while creating HWPFDocument object using FileInputStream "+e.getMessage());
e.printStackTrace();
}
}
// else if condition for match to pdf file
else if (fileEx.equals("pdf")) {
System.out.println(" pdf File uploaded");
_log.info("##### pdf file uploaded "+resumePath);
try {
FileInputStream fs = new FileInputStream(resumePath);
System.out.println(" filestream");
PdfReader pdDoc = new PdfReader(fs);
lines = getParagraphsFromResume(pdDoc) ;
_log.info("##### lines are :"+lines);
} catch (Exception e) {
// TODO Auto-generated catch block
_log.info("##### Exception while creating PdfReader object using FileInputStream "+e.getMessage());
e.printStackTrace();
}
}
// else if condition for match to rtf file
else if(fileEx.equals("rtf"))
{
System.out.println(" rtf File uploaded");
_log.info("##### rtf file uploaded "+resumePath);
try {
FileInputStream fs = new FileInputStream(resumePath);
System.out.println(" filestream");
RTFEditorKit rtfParser = new RTFEditorKit();
Document document = rtfParser. createDefaultDocument();
rtfParser.read(fs, document, 0);
lines = document.getText(0, document.getLength()).split("\ \n");
_log.info("##### lines are :"+lines);
System.out.println("lines :"+lines);
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}
// else condition for no matches
else {
System.out.println("wrong format");
}
_log.info("##### End of getResumeContent Method ######");
res.setResumeContent(lines);
/** Read the content **/
}
/**
 * Extracts the text of an already-opened resume document and splits it into
 * lines.
 * Supported argument types are HWPFDocument (.doc), XWPFDocument (.docx)
 * and PdfReader (.pdf). For a PdfReader only page 1 is extracted.
 *
 * @param doc the opened document
 * @return the extracted lines, or null when doc is of any other type (or
 *         null) or when extraction failed; failures are logged and printed
 */
public static String[] getParagraphsFromResume(Object doc) {
    _log.info("##### entered into getParagraphsFromResume method #######");
    String[] LinesInDocument = null;
    if (doc instanceof HWPFDocument) {
        _log.info("##### Document is instanceof HWPDFDocument object");
        try {
            WordExtractor we = new WordExtractor((HWPFDocument) doc);
            // "\\r?\\n" keeps a carriage return off the end of each line when the text uses CRLF.
            LinesInDocument = we.getText().split("\\r?\\n");
            _log.info("##### Extracted lines from doc file");
        } catch (Exception e) {
            _log.info("##### Exception while Extracting text from doc file "+e.getMessage());
            e.printStackTrace();
        }
    } else if (doc instanceof XWPFDocument) {
        _log.info("##### Document is instanceof XWPDFDocument object");
        try {
            XWPFWordExtractor xwe = new XWPFWordExtractor((XWPFDocument) doc);
            LinesInDocument = xwe.getText().split("\\r?\\n");
            _log.info("##### Extracted lines from docx file");
        } catch (Exception e) {
            _log.info("##### Exception while Extracting text from docx file "+e.getMessage());
            e.printStackTrace();
        }
    } else if (doc instanceof PdfReader) {
        _log.info("##### Document is instanceof PdfReader object");
        try {
            // Only the first page is read, as before.
            LinesInDocument = PdfTextExtractor.getTextFromPage((PdfReader) doc, 1).split("\\r?\\n");
            _log.info("##### Extracted lines from pdf file");
        } catch (IOException e) {
            _log.info("##### Exception while Extracting text from pdf file "+e.getMessage());
            e.printStackTrace();
        }
    }
    _log.info("##### End of getParagraphsFromResume method #######");
    return LinesInDocument;
}
/**
 * Prints to standard output the header text that HeaderStories returns for
 * the given page number of the document.
 *
 * @param doc        the .doc document to read from
 * @param pageNumber page number passed to HeaderStories.getHeader
 */
public static void readHeader(HWPFDocument doc, int pageNumber) {
    String headerText = new HeaderStories(doc).getHeader(pageNumber);
    System.out.println("Header Is: " + headerText);
}
/**
 * Prints to standard output the footer text that HeaderStories returns for
 * the given page number of the document.
 *
 * @param doc        the .doc document to read from
 * @param pageNumber page number passed to HeaderStories.getFooter
 */
public static void readFooter(HWPFDocument doc, int pageNumber) {
    String footerText = new HeaderStories(doc).getFooter(pageNumber);
    System.out.println("Footer Is: " + footerText);
}
/**
 * Returns the extension of a file name or path: the text after the last '.'
 * of the final path segment, without the dot and in its original case.
 * Both '/' and '\' are treated as path separators, so a dot inside a folder
 * name is never taken for the extension.
 *
 * @param fileName file name or path; may be null
 * @return the extension, or an empty string when fileName is null or its
 *         final segment contains no '.'
 */
public static String fileExtension(String fileName) {
    _log.info("##### Entered into fileExtentsion Method ########");
    String extension = "";
    if (fileName != null) {
        int lastDot = fileName.lastIndexOf('.');
        int lastSeparator = Math.max(fileName.lastIndexOf('/'), fileName.lastIndexOf('\\'));
        // A dot only counts when it comes after the last path separator.
        if (lastDot > lastSeparator) {
            extension = fileName.substring(lastDot + 1);
        }
    }
    _log.info("##### File Extension is "+extension);
    System.out.println("returning extension " + extension);
    _log.info("##### End of fileExtension method ######## ");
    return extension;
}
// E-mail shape; matched case-insensitively so tokens need no locale-dependent upper-casing.
private static final Pattern EMAIL_PATTERN =
        Pattern.compile("[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}", Pattern.CASE_INSENSITIVE);
// Any character that cannot appear in an e-mail address separates candidate tokens.
private static final Pattern EMAIL_TOKEN_SEPARATOR = Pattern.compile("[^a-zA-Z0-9@._%+-]");

/**
 * Finds the first e-mail address in the resume content and stores it with
 * res.setEmail(...). Stores null when the content is null or contains no
 * e-mail address.
 *
 * @param res resume record whose content is read and whose e-mail is written
 */
public static void parseEmail(ResumeInfo res) {
    _log.info("##### Entered into parseEmail method ###### ");
    res.setEmail(findFirstEmail(res.getResumeContent()));
}

// Returns the first token, scanning lines top to bottom, that is entirely an e-mail address; null if none.
private static String findFirstEmail(String[] resumePara) {
    if (resumePara == null) {
        _log.info("##### No resume content available, email not parsed");
        return null;
    }
    for (int i = 0; i < resumePara.length; i++) {
        if (resumePara[i] == null) {
            continue;
        }
        String[] tokens = EMAIL_TOKEN_SEPARATOR.split(resumePara[i]);
        for (int j = 0; j < tokens.length; j++) {
            String token = tokens[j];
            if (EMAIL_PATTERN.matcher(token).matches()) {
                _log.info("##### Match found ");
                _log.info("##### Email is "+token +"");
                System.out.println("email is " + token + "");
                // Stop at the first hit so later lines cannot overwrite it.
                return token;
            }
        }
    }
    return null;
}
// Phone-number shape used to validate a whole candidate token.
private static final Pattern PHONE_PATTERN = Pattern.compile(
        "[+\\d{3}]?[(]?[+]?[[\\d]{1,3}]?[)]?[(]?[-+.\\s]?[(]?\\d{3}[)]?[-.\\s]?\\d{3,4}\\s?[-.\\s]?\\s?\\d{3,4}[)]?[.]?");
// Any character that cannot appear in a phone number separates candidate tokens.
private static final Pattern PHONE_TOKEN_SEPARATOR = Pattern.compile("[^0-9-.+\\s\\(\\)]");
// A word followed by '-', '.' or ':' (e.g. a "Phone:" style label).
private static final Pattern LABEL_PREFIX = Pattern.compile("[a-zA-Z]+[-.:]");
// A word wrapped in parentheses (e.g. "(Mobile)").
private static final Pattern PARENTHESIZED_WORD = Pattern.compile("[(]+[a-zA-Z]+[)]");

/**
 * Finds the first phone number in the resume content and stores it, trimmed,
 * with res.setPhoneNumber(...). Stores null when the content is null or no
 * token matches the phone-number pattern.
 *
 * @param res resume record whose content is read and whose phone number is
 *            written
 */
public static void parseContact(ResumeInfo res)
{
    _log.info("##### Entered into parseContact method ###### ");
    res.setPhoneNumber(findFirstPhoneNumber(res.getResumeContent()));
}

// Returns the first token, scanning lines top to bottom, that is entirely a phone number; null if none.
private static String findFirstPhoneNumber(String[] resumePara) {
    if (resumePara == null) {
        _log.info("##### No resume content available, phone number not parsed");
        return null;
    }
    for (int i = 0; i < resumePara.length; i++) {
        if (resumePara[i] == null) {
            continue;
        }
        // Strip labels before tokenizing. The replacement "\\n" is an escaped 'n'
        // in replaceAll syntax, so each label becomes the letter n, which the
        // separator pattern then splits on. Parenthesized words become a real
        // newline, which counts as whitespace and is trimmed or matched by \s.
        String cleaned = LABEL_PREFIX.matcher(resumePara[i]).replaceAll("\\n");
        cleaned = PARENTHESIZED_WORD.matcher(cleaned).replaceAll("\n");
        String[] tokens = PHONE_TOKEN_SEPARATOR.split(cleaned);
        for (int j = 0; j < tokens.length; j++) {
            String candidate = tokens[j].trim();
            if (PHONE_PATTERN.matcher(candidate).matches()) {
                _log.info("##### Match found ");
                _log.info("##### PhoneNumber is "+candidate +"");
                System.out.println("Phone Number is " + candidate + "");
                // Stop at the first hit so later lines cannot overwrite it.
                return candidate;
            }
        }
    }
    return null;
}
/* parsing name from file lines */
public static void parseName(ResumeInfo res)
{
String firstName=null;
String lastName=null;
String middleName=null;
String[] resumePara = res.getResumeContent();
try{
//String Line=resumePara[0].trim();
for(int i=0;i<=3;i++){
String Line=resumePara[i].trim();
System.out.println("Line :"+Line);
String regex="\\w+\\s*?\\w+?[.\\s*]?\ \w+?";
Pattern pattern = Pattern.compile(regex);
Matcher matcher = pattern.matcher(Line);
if (matcher.matches()) {
_log.info("##### Match found ");
//_log.info("##### name is "+Line +"");
System.out.println("name is " + Line + "");
try{
String[] subLine=Line.split("[.\\s]");
if(subLine.length<=2){
firstName=subLine[0];
lastName=subLine[1];
System.out.println("first Name -"+firstName);
}
else{
firstName=subLine[0];
middleName=subLine[1];
lastName=subLine[2];
System.out.println("first Name -"+firstName);
}
break;
}
catch(Exception E){}
System.out.println("First Name :"+firstName+" Middle Name :"+middleName+"Last Name :"+lastName);
}
}}catch(Exception e){e.printStackTrace();}
res.setFirstName(firstName);
res.setLastName(lastName);
res.setMiddleName(middleName);
}
/**
 * Prints a table with one row per resume whose first name was parsed,
 * showing the expected file name next to the parsed first, middle and last
 * name, followed by the counts of parsed and unparsed names.
 * Rows are paired by index; only indexes present in both lists are printed.
 * A failure while printing one row is logged and printed, and the remaining
 * rows are still processed.
 *
 * @param expectedValues records built from the file names
 * @param actualValues   records built from the file contents
 */
public static void printResults(List<ResumeInfo> expectedValues,
        List<ResumeInfo> actualValues) {
    _log.info("##### Entered into printResults method ###### ");
    String separator = "--------+- ------------------------------ -------------------------+---- ------------------------------ -------------";
    // No space flag in the format: "% 30s" is illegal for %s and throws at run time.
    String rowFormat = "%4s%50s%30s%30s%30s";
    int parsedCount = 0;
    System.out.println("Printing results\n\n\n\n\n");
    System.out.println("Success full results :\n\n");
    System.out.println(separator);
    System.out.format(rowFormat, "no" ,"fileName","FirstName" ,"Middle Name" ,"LastName");
    System.out.println();
    // Pair rows by index without running past the shorter list.
    int rowCount = Math.min(expectedValues.size(), actualValues.size());
    for (int i = 0; i < rowCount; i++) {
        try {
            ResumeInfo actual = actualValues.get(i);
            String firstName = actual.getFirstName();
            if (firstName != null) {
                parsedCount++;
                System.out.format(rowFormat, i+1, expectedValues.get(i).getFileName(),
                        firstName, actual.getMiddleName(), actual.getLastName());
                System.out.println("\n");
            }
        } catch (Exception e) {
            _log.info("##### Exception while printing results ");
            e.printStackTrace();
        }
    }
    System.out.println(separator);
    // The counter tracks parsed first names, so the labels say "names".
    System.out.println("\n\nCount of names parsed: "+ parsedCount);
    System.out.println("Count of names not parsed: "+ (actualValues.size()-parsedCount));
}
}
// Residue of the web page this listing was copied from ("No comments: Post a Comment"); not part of the program.