Invoices and receipts are documents that record transactions in a particular format whenever goods or services are bought or sold. As business has gone digital and online shopping has grown in popularity, digital invoices have become widely used. Processing a large number of digital invoices and extracting their information manually is a complex and time-consuming process, so you need a faster and more efficient way to handle it. In this article, I am going to show you how to extract data from a PDF invoice or receipt programmatically in C#.
Workflow for Extracting Data from a PDF Invoice
The workflow for extracting data from a PDF invoice using the GroupDocs.Parser for .NET API is as follows:
- Create table parameters for extracting data from the tables.
- Create template items for extracting data from fields.
- Parse the invoice according to the given template.
- Extract the data.
The Invoice
The following is a screenshot of the sample PDF invoice that I’ll use to demonstrate the data extraction. You can download this invoice from here.
The Code
- Create the template for the given invoice (read more about templates).
// Page regions that bound the "Details" and "Summary" tables
var detailsRegion = new Rectangle(new Point(35, 320), new Size(530, 55));
var detailsLayout = new TemplateTableParameters(detailsRegion, null);
var summaryRegion = new Rectangle(new Point(330, 385), new Size(220, 65));
var summaryLayout = new TemplateTableParameters(summaryRegion, null);

// Sender and recipient blocks: each field is read from a fixed rectangle on the page
var fromCompany = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 135), new Size(100, 10))),
    "FromCompany");
var fromAddress = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 150), new Size(100, 35))),
    "FromAddress");
var fromEmail = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 190), new Size(150, 2))),
    "FromEmail");
var toCompany = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 250), new Size(100, 2))),
    "ToCompany");
var toAddress = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 260), new Size(100, 15))),
    "ToAddress");
var toEmail = new TemplateField(
    new TemplateFixedPosition(new Rectangle(new Point(35, 290), new Size(150, 2))),
    "ToEmail");

// Label/value pairs: the label field is located by a regular expression and the
// value field is positioned relative to that label through a linked position.
// NOTE(review): the edge flags (false, false, true, false) presumably select the
// side of the label on which the value is searched - confirm against the API docs.
var invoiceNumberLabel = new TemplateField(new TemplateRegexPosition("Invoice Number"), "InvoiceNumber");
var invoiceNumberValue = new TemplateField(
    new TemplateLinkedPosition(
        "InvoiceNumber",
        new Size(200, 15),
        new TemplateLinkedPositionEdges(false, false, true, false)),
    "InvoiceNumberValue");

var orderNumberLabel = new TemplateField(new TemplateRegexPosition("Order Number"), "InvoiceOrder");
var orderNumberValue = new TemplateField(
    new TemplateLinkedPosition(
        "InvoiceOrder",
        new Size(200, 15),
        new TemplateLinkedPositionEdges(false, false, true, false)),
    "InvoiceOrderValue");

var invoiceDateLabel = new TemplateField(new TemplateRegexPosition("Invoice Date"), "InvoiceDate");
var invoiceDateValue = new TemplateField(
    new TemplateLinkedPosition(
        "InvoiceDate",
        new Size(200, 15),
        new TemplateLinkedPositionEdges(false, false, true, false)),
    "InvoiceDateValue");

var dueDateLabel = new TemplateField(new TemplateRegexPosition("Due Date"), "DueDate");
var dueDateValue = new TemplateField(
    new TemplateLinkedPosition(
        "DueDate",
        new Size(200, 15),
        new TemplateLinkedPositionEdges(false, false, true, false)),
    "DueDateValue");

var totalDueLabel = new TemplateField(new TemplateRegexPosition("Total Due"), "TotalDue");
var totalDueValue = new TemplateField(
    new TemplateLinkedPosition(
        "TotalDue",
        new Size(200, 15),
        new TemplateLinkedPositionEdges(false, false, true, false)),
    "TotalDueValue");

// Table items built from the two regions defined above
var detailsTable = new TemplateTable(detailsLayout, "details", null);
var summaryTable = new TemplateTable(summaryLayout, "summary", null);

// Assemble every item, in document order, into the template
TemplateItem[] invoiceItems =
{
    fromCompany,
    fromAddress,
    fromEmail,
    toCompany,
    toAddress,
    toEmail,
    invoiceNumberLabel,
    invoiceNumberValue,
    orderNumberLabel,
    orderNumberValue,
    invoiceDateLabel,
    invoiceDateValue,
    dueDateLabel,
    dueDateValue,
    totalDueLabel,
    totalDueValue,
    detailsTable,
    summaryTable
};

// Create a document template
Template template = new Template(invoiceItems);
- Parse the invoice and extract data.
// Open the invoice; the using block disposes the parser once extraction is done
using (Parser parser = new Parser("invoice.pdf"))
{
    // Parse the document by the template
    DocumentData data = parser.ParseByTemplate(template);

    // ParseByTemplate returns null when parsing by template isn't supported for
    // the document, so guard before touching the result
    if (data == null)
    {
        Console.WriteLine("Parsing by template isn't supported for this document.");
    }
    else
    {
        // Print all extracted data
        for (int i = 0; i < data.Count; i++)
        {
            Console.Write(data[i].Name + ": ");

            // Check if the field is a table
            PageTableArea area = data[i].PageArea as PageTableArea;
            if (area == null)
            {
                // Not a table: print the page area's string representation
                PageArea pageArea = data[i].PageArea;
                Console.WriteLine(pageArea.ToString());
            }
            else
            {
                // Iterate via table rows
                for (int row = 0; row < area.RowCount; row++)
                {
                    // Iterate via table columns
                    for (int column = 0; column < area.ColumnCount; column++)
                    {
                        // Print the separator between columns
                        if (column > 0)
                        {
                            Console.Write("\t");
                        }

                        // Get the cell value; an empty cell or a cell that holds
                        // something other than text is printed as an empty string
                        var cell = area[row, column];
                        PageTextArea cellValue = cell == null ? null : cell.PageArea as PageTextArea;

                        // Print the cell value
                        Console.Write(cellValue == null ? "" : cellValue.Text);
                    }

                    // Print new line
                    Console.WriteLine();
                }
            }
        }
    }
}
The Output
Cheers!
Top comments (1)
Thanks for sharing! I'll try it :)