Thanks for the advice.
We are using version 9.4.0.
It works okay if I try to save the PDF as a DOCX, but I notice that bullets are not displayed next to their respective text.
e.g.:
• <o:p></o:p>
Overview of content types
Here is my code:
// Convert the PDF bytes in inFile to DOCX entirely in memory and return the result.
SetPdfAsposeLicence();
// Both streams are disposed once the converted bytes have been copied out.
using (var pdfStream = new MemoryStream(inFile))
using (var documentStream = new MemoryStream())
{
    var pdfDocument = new Aspose.Pdf.Document(pdfStream);
    pdfDocument.Save(documentStream, Aspose.Pdf.SaveFormat.DocX);
    // ToArray copies the full stream contents regardless of Position,
    // so no manual rewind + Read into a pre-sized buffer is needed.
    return documentStream.ToArray();
}
I have however been able to meet my requirements by using the code below:
/// <summary>
/// Builds a Word document containing the text extracted from the given PDF
/// and returns it serialized as DOCX.
/// </summary>
/// <param name="inFile">Raw bytes of the source PDF.</param>
/// <returns>The generated document saved as DOCX with ISO 29500:2008 Transitional compliance.</returns>
private static byte[] ConvertPdfToWord(byte[] inFile)
{
    string pdfText = GetPdfText(inFile);
    // A single space is written in place of null or empty extracted text.
    if (string.IsNullOrEmpty(pdfText))
    {
        pdfText = " ";
    }

    var wordDocument = new Aspose.Words.Document();
    new DocumentBuilder(wordDocument).Writeln(pdfText);

    var saveOptions = new Aspose.Words.Saving.OoxmlSaveOptions(Aspose.Words.SaveFormat.Docx);
    saveOptions.Compliance = OoxmlCompliance.Iso29500_2008_Transitional;

    using (var output = new MemoryStream())
    {
        wordDocument.Save(output, saveOptions);
        return output.ToArray();
    }
}
/// <summary>
/// Extracts the text of every page of a PDF using the raw text formatting mode.
/// </summary>
/// <param name="inFile">Raw bytes of the source PDF.</param>
/// <returns>The text of all pages, concatenated in the order the pages are enumerated.</returns>
private static string GetPdfText(byte[] inFile)
{
    SetPdfAsposeLicence();
    var builder = new System.Text.StringBuilder();
    // The same device and options serve every page, so they are configured once
    // here rather than being reassigned on each loop iteration.
    var textDevice = new TextDevice();
    textDevice.ExtractionOptions =
        new TextExtractionOptions(TextExtractionOptions.TextFormattingMode.Raw);
    // Dispose the input stream once all pages have been processed.
    using (var documentStream = new MemoryStream(inFile))
    {
        var pdfDocument = new Aspose.Pdf.Document(documentStream);
        foreach (Page pdfPage in pdfDocument.Pages)
        {
            // The using block closes the per-page stream; no explicit Close() is needed.
            using (var textStream = new MemoryStream())
            {
                textDevice.Process(pdfPage, textStream);
                // The device output is decoded as UTF-16 (Encoding.Unicode).
                builder.Append(Encoding.Unicode.GetString(textStream.ToArray()));
            }
        }
    }
    return builder.ToString();
}
Thanks and Regards
Rob