Extract and verify text from PDF with C#
PDF verification is pretty rare case in automation testing. Still it could happen.
iTextSharp
iTextSharp is a library that allows you to manipulate PDF files. We need very small of this library. It has build in reader that iterates through pages and returns only text.using iTextSharp.text.pdf;
using iTextSharp.text.pdf.parser;
using System.Text;
namespace PDFExtractor
{
public class PDFExtractor
{
public static string ExtractTextFromPDF(string pdfFileName)
{
StringBuilder result = new StringBuilder();
// Create a reader for the given PDF file
using (PdfReader reader = new PdfReader(pdfFileName))
{
// Read pages
for (int page = 1; page <= reader.NumberOfPages; page++)
{
SimpleTextExtractionStrategy strategy =
new SimpleTextExtractionStrategy();
string pageText =
PdfTextExtractor.GetTextFromPage(reader, page, strategy);
result.Append(pageText);
}
}
return result.ToString();
}
}
}