[POC Sineo local links issue] HTML to PDF transformation (using WORD DLL and PDF DLL)

Hello,

During my HTML-to-PDF conversion tests, I could not get the local (in-document) links of my HTML file ‘bcl_1948506000_corrected.htm’ (in the zip file ‘input.zip’) to work. They work fine in the HTML file but not in the generated PDF. The links are <a> tags whose href attribute points to the id attribute of an element in the same document.

Could you please investigate this?

First, define the following properties in your class:

/// <summary>Document to convert; assigned from the loaded HTML before <c>ToPdf()</c> is called.</summary>
public Document DocxFile { get; set; }
/// <summary>Stream that receives the PDF saved by Aspose.Words in <c>ToPdf()</c>.</summary>
public MemoryStream StreamOut { get; set; }
/// <summary>Stream that receives the PDF re-saved by Aspose.Pdf; only written when embedded files were found.</summary>
public MemoryStream StreamOutPdf { get; set; }


Then, below is the code used to transform HTML to PDF:

// HTML entries of the archive, keyed by their cleaned entry name.
Dictionary<String, MemoryStream> htmlFiles = new Dictionary<String, MemoryStream>();
//Unzipping
// NOTE(review): the archive path is empty in the original post; replace it with the real path (e.g. "input.zip").
ZipFile zip = new ZipFile("");
try
{
	foreach (ZipEntry zipEntry in zip)
	{
		// Directory entries carry no data.
		if (zipEntry.IsDirectory)
		{
			continue;
		}

		// Ordinal, case-insensitive extension test: independent of the current culture.
		String fileName = zipEntry.Name;
		bool isHtml = fileName.EndsWith(".HTML", StringComparison.OrdinalIgnoreCase)
			|| fileName.EndsWith(".HTM", StringComparison.OrdinalIgnoreCase);
		if (!isHtml)
		{
			continue;
		}

		// CopyTo loops until end-of-stream; a single Read() call may return fewer
		// bytes than requested and silently truncate the entry.
		MemoryStream memStream = new MemoryStream();
		using (Stream entryStream = zip.GetInputStream(zipEntry))
		{
			entryStream.CopyTo(memStream);
		}
		// Rewind so that the document loader below reads from the first byte.
		memStream.Position = 0;
		htmlFiles.Add(ZipEntry.CleanName(zipEntry.Name), memStream);
	}
}
finally
{
	zip.Close();
}

// Conversion of each HTML file
foreach (KeyValuePair<String, MemoryStream> htmlEntry in htmlFiles)
{
	using (MemoryStream newHtmlDoc = htmlEntry.Value)
	{
		Aspose.Words.LoadOptions opt = new Aspose.Words.LoadOptions();
		Aspose.Words.DocumentBuilder builder = new Aspose.Words.DocumentBuilder();
		opt.LoadFormat = Aspose.Words.LoadFormat.Html;
		opt.Encoding = Encoding.UTF8;

		Aspose.Words.Document doc = new Aspose.Words.Document(newHtmlDoc, opt);
		builder.Document = doc;
		builder.PageSetup.Orientation = Orientation.Landscape;
		builder.PageSetup.PaperSize = PaperSize.A4;
		builder.PageSetup.LeftMargin = 0;
		builder.PageSetup.RightMargin = 0;

		builder.Document.Range.Replace(ControlChar.NonBreakingSpace, " ", false, false);
		//Margin reducing
		foreach (Section sec in doc.Sections)
		{
			sec.PageSetup.TopMargin = 8;
			sec.PageSetup.BottomMargin = 8;
		}
		// Fresh output streams for every converted file.
		this.StreamOut = new MemoryStream();
		this.StreamOutPdf = new MemoryStream();
		this.DocxFile = forceImageSize(doc);
		ToPdf();
	}
}

Below is the method forceImageSize(Document doc):

/// <summary>
/// Sets the width and height of every image-bearing shape in the document
/// to the size, in points, reported by the image data itself.
/// </summary>
/// <param name="aDoc">Document whose shapes are resized in place.</param>
/// <returns>The same <paramref name="aDoc"/> instance.</returns>
protected Document forceImageSize(Document aDoc)
{
	foreach (Aspose.Words.Drawing.Shape picture in aDoc.GetChildNodes(NodeType.Shape, true))
	{
		// Shapes without image data are left untouched.
		if (!picture.HasImage)
		{
			continue;
		}

		var nativeSize = picture.ImageData.ImageSize;
		picture.Width = nativeSize.WidthPoints;
		picture.Height = nativeSize.HeightPoints;
	}

	return aDoc;
}

Below is the method ToPdf():

/// <summary>
/// Saves <c>DocxFile</c> as PDF into <c>StreamOut</c>. When the document contains
/// embedded OLE files, the PDF is reopened with Aspose.Pdf, the files are attached
/// and the result is saved into <c>StreamOutPdf</c>.
/// </summary>
/// <remarks>
/// <c>StreamOutPdf</c> is only written when embedded files exist; otherwise the
/// PDF is available in <c>StreamOut</c> only.
/// </remarks>
private void ToPdf()
{
	try
	{
		PdfSaveOptions options = new PdfSaveOptions();
		options.ImageCompression = PdfImageCompression.Jpeg;
		options.JpegQuality = 100;
		options.FontEmbeddingMode = PdfFontEmbeddingMode.EmbedNone;

		//Retrieve embedded files from docx file.
		Dictionary<String, MemoryStream> embeddedFiles = ExtractEmbeddedFiles();
		try
		{
			this.DocxFile.Save(this.StreamOut, options);
		}
		catch (Exception ex)
		{
			logger.Error("Error on ASPOSE PDF transformation.", ex);
			// "throw;" keeps the original stack trace ("throw ex;" would reset it).
			// The outer handler below logs this exception a second time.
			throw;
		}

		// Embedded file saving in PDF if needed
		if (embeddedFiles.Count > 0)
		{
			// The stream was just written; rewind it so the PDF is read from its first byte.
			this.StreamOut.Position = 0;
			Aspose.Pdf.Document pdfDocument = new Aspose.Pdf.Document(this.StreamOut);
			ImportFilesToPdf(pdfDocument, embeddedFiles);
			pdfDocument.Save(this.StreamOutPdf);
		}
	}
	catch (Exception globalEx)
	{
		logger.Error("Error in HTML conversion method.", globalEx);
		throw;
	}
}

Below is the method ExtractEmbeddedFiles():

/// <summary>
/// Saves every embedded OLE object of <c>DocxFile</c> into a memory stream and
/// sets the hyperlink of the owning shape to the file name used as dictionary key.
/// </summary>
/// <returns>
/// The extracted files keyed by file name; empty when the document has no
/// embedded OLE object.
/// </returns>
private Dictionary<String, MemoryStream> ExtractEmbeddedFiles()
{
	Dictionary<String, MemoryStream> oleStreamDictionary = new Dictionary<String, MemoryStream>();
	Aspose.Words.NodeCollection shapes = this.DocxFile.GetChildNodes(NodeType.Shape, true);
	int i = 0;
	foreach (Aspose.Words.Drawing.Shape shape in shapes)
	{
		Aspose.Words.Drawing.OleFormat ole = shape.OleFormat;
		if (ole == null)
		{
			continue;
		}

		// Linked OLE objects are skipped: OleFormat.Save is expected to throw for them.
		// NOTE(review): confirm against the Aspose.Words version in use.
		if (ole.IsLink)
		{
			continue;
		}

		String extension = ole.SuggestedExtension ?? "";
		String filename;
		if (String.IsNullOrEmpty(ole.SuggestedFileName))
		{
			filename = "OLE_OBJECT_NO_" + (i + 1) + extension;
		}
		else
		{
			filename = ole.SuggestedFileName;
			// Ordinal comparison: the extension test must not depend on the current culture.
			if (!filename.EndsWith(extension, StringComparison.Ordinal))
			{
				filename += extension;
			}
		}

		// Two objects may suggest the same name; Dictionary.Add would throw on the
		// second one, so prefix a counter until the name is unique.
		String uniqueName = filename;
		int duplicate = 1;
		while (oleStreamDictionary.ContainsKey(uniqueName))
		{
			uniqueName = duplicate + "_" + filename;
			duplicate++;
		}

		MemoryStream ms = new MemoryStream();
		ole.Save(ms);
		oleStreamDictionary.Add(uniqueName, ms);
		// The link target is matched against the dictionary keys in ImportFilesToPdf.
		shape.HRef = uniqueName;
		i++;
	}
	return oleStreamDictionary;
}

Below is the method ImportFilesToPdf(Aspose.Pdf.Document pdfFile, Dictionary<string, MemoryStream> embeddedFiles):

/// <summary>
/// For every link annotation whose URI names one of the embedded files, adds an
/// invisible, read-only file-attachment annotation over the same rectangle.
/// </summary>
/// <param name="pdfFile">PDF document whose pages are scanned and modified.</param>
/// <param name="embeddedFiles">Embedded files keyed by file name.</param>
private void ImportFilesToPdf(Aspose.Pdf.Document pdfFile, Dictionary<string, MemoryStream> embeddedFiles)
{
	foreach (Aspose.Pdf.Page lPdfPage in pdfFile.Pages)
	{
		// Snapshot the link annotations first: the loop below adds annotations to
		// the page, and a collection must not be modified while it is enumerated.
		// "as" also skips non-link annotations instead of throwing InvalidCastException.
		List<Aspose.Pdf.Annotations.LinkAnnotation> links = new List<Aspose.Pdf.Annotations.LinkAnnotation>();
		foreach (Aspose.Pdf.Annotations.Annotation annotation in lPdfPage.Annotations)
		{
			Aspose.Pdf.Annotations.LinkAnnotation link = annotation as Aspose.Pdf.Annotations.LinkAnnotation;
			if (link != null)
			{
				links.Add(link);
			}
		}

		foreach (Aspose.Pdf.Annotations.LinkAnnotation la in links)
		{
			// Local (in-document) links carry another action type; a hard cast
			// would throw for them, so they are skipped here.
			Aspose.Pdf.Annotations.GoToURIAction goToAction = la.Action as Aspose.Pdf.Annotations.GoToURIAction;
			if (goToAction == null || goToAction.URI == null)
			{
				continue;
			}

			String fileName = goToAction.URI.Replace(@"%20", @" ");
			MemoryStream ms;
			if (!embeddedFiles.TryGetValue(fileName, out ms))
			{
				continue;
			}

			// Each attachment gets its own copy of the data, left open: several
			// links may target the same file, and the PDF is saved only after this
			// method returns. NOTE(review): presumably Aspose.Pdf reads the stream
			// at save time — confirm.
			MemoryStream attachmentData = new MemoryStream(ms.ToArray());
			Aspose.Pdf.FileSpecification fs = new Aspose.Pdf.FileSpecification(attachmentData, fileName);
			Aspose.Pdf.Annotations.FileAttachmentAnnotation faa = new Aspose.Pdf.Annotations.FileAttachmentAnnotation(lPdfPage, la.Rect, fs);
			faa.Icon = Aspose.Pdf.Annotations.FileIcon.Graph;
			faa.Opacity = 0.0;
			faa.Flags = Aspose.Pdf.Annotations.AnnotationFlags.ReadOnly;
			lPdfPage.Annotations.Add(faa);
		}
	}
}

input.zip (732.4 KB)

@CSCT,

Kindly simplify the code and create a small console application, and then send us a Zip of this project. We will investigate your scenario and share our findings with you.

Hello,

Thank you for your answer. I have simplified my code: it now creates an output DOCX file from an HTML file (the same input as before, ‘input.zip’). Attached is the output file ‘word.docx’, in which you will see that the internal links (shown as ‘–>’) do not work. They are misinterpreted and point to the beginning of the document instead of the page whose number is written after the #.

Below my new code :

// Split the archive into HTML documents (htmlFiles) and the other files they may
// reference (otherFiles), each keyed by its cleaned entry name.
ZipFile zip = new ZipFile("input.zip");
try
{
	foreach (ZipEntry zipEntry in zip)
	{
		// Directory entries carry no data.
		if (zipEntry.IsDirectory)
		{
			continue;
		}

		// CopyTo loops until end-of-stream; a single Read() call may return fewer
		// bytes than requested and silently truncate the entry.
		MemoryStream memStream = new MemoryStream();
		using (Stream entryStream = zip.GetInputStream(zipEntry))
		{
			entryStream.CopyTo(memStream);
		}
		// Rewind so that consumers read from the first byte.
		memStream.Position = 0;

		// Ordinal, case-insensitive extension test: independent of the current culture.
		String fileName = zipEntry.Name;
		bool isHtml = fileName.EndsWith(".HTML", StringComparison.OrdinalIgnoreCase)
			|| fileName.EndsWith(".HTM", StringComparison.OrdinalIgnoreCase);
		if (isHtml)
		{
			htmlFiles.Add(ZipEntry.CleanName(zipEntry.Name), memStream);
		}
		else
		{
			otherFiles.Add(ZipEntry.CleanName(zipEntry.Name), memStream);
		}
	}
}
finally
{
	zip.Close();
}

foreach (string key in htmlFiles.Keys)
{
	using (MemoryStream htmlDoc = htmlFiles[key])
	{
		htmlDoc.Position = 0;
		Aspose.Words.LoadOptions opt = new Aspose.Words.LoadOptions();
		Aspose.Words.DocumentBuilder builder = new Aspose.Words.DocumentBuilder();
		opt.LoadFormat = Aspose.Words.LoadFormat.Html;
		// NOTE(review): the builder handed to ImageLoader still targets its own blank
		// document here; the page setup it exposes is presumably the default one if
		// the callback runs while the HTML is loaded below — confirm.
		opt.ResourceLoadingCallback = new ImageLoader(otherFiles, builder);
		opt.Encoding = Encoding.UTF8;

		Aspose.Words.Document doc = new Aspose.Words.Document(htmlDoc, opt);
		builder.Document = doc;
		builder.PageSetup.Orientation = Orientation.Landscape;
		builder.PageSetup.PaperSize = PaperSize.A3;

		builder.Document.Range.Replace(ControlChar.NonBreakingSpace, " ", false, false);
		//Margin reducing
		foreach (Section sec in doc.Sections)
		{
			sec.PageSetup.TopMargin = 8;
			sec.PageSetup.BottomMargin = 8;
		}
		doc.Save("result_word.docx");
	}
}

Attached below is ‘output.zip’, which contains ‘word.docx’:

output.zip (288.0 KB)

I hope it will be ok for you with these new entries to investigate on my issue.

Many thanks

Best regards

@CSCT,

The line of code that references the ImageLoader class is giving a compilation error. Kindly share details about the ImageLoader class. If it is part of the Aspose.Words API, then kindly let us know which version of the Aspose.Words for .NET API you are using. Your response is awaited.

Hello,

Below is the (simplified) class ‘ImageLoader’, which implements the ‘IResourceLoadingCallback’ interface of the Aspose.Words DLL:

namespace CsctConverter.Utils
{
    /// <summary>
    /// Resource-loading callback that serves images referenced by an HTML document
    /// from an in-memory dictionary of files, scaling down images that exceed the
    /// page size of the supplied builder.
    /// </summary>
    public class ImageLoader : IResourceLoadingCallback
    {
        private Dictionary<String, MemoryStream> otherFiles;
        private DocumentBuilder builder;

        /// <summary>
        /// Initializes the image loader.
        /// </summary>
        /// <param name="dico">Available resource files keyed by their path.</param>
        /// <param name="aBuilder">
        /// Builder whose page setup limits the image size; when <c>null</c>, images
        /// are passed through without resizing.
        /// </param>
        public ImageLoader(Dictionary<String, MemoryStream> dico, DocumentBuilder aBuilder)
        {
            otherFiles = dico;
            builder = aBuilder;
        }

        /// <summary>
        /// Supplies the data of an image resource from the file dictionary.
        /// </summary>
        /// <param name="args">Description of the resource being loaded.</param>
        /// <returns>
        /// <c>UserProvided</c> when the image was found and its data set on
        /// <paramref name="args"/>; <c>Skip</c> for non-image resources and for
        /// images that are not in the dictionary.
        /// </returns>
        public ResourceLoadingAction ResourceLoading(ResourceLoadingArgs args)
        {
            if (args.ResourceType != ResourceType.Image)
            {
                return ResourceLoadingAction.Skip;
            }

            MemoryStream ms = FindImageStream(args);
            if (ms == null)
            {
                return ResourceLoadingAction.Skip;
            }

            if (builder == null) // transform in pdf or pdfa
            {
                args.SetData(ms.ToArray());
                return ResourceLoadingAction.UserProvided;
            }

            // transform html to docx
            // Decode from the first byte whatever position the stream was left at.
            ms.Position = 0;
            PageSetup ps = builder.PageSetup;
            // Image creation to get size; disposed to release the GDI+ handle.
            using (Image img = Image.FromStream(ms))
            {
                // NOTE(review): image dimensions are in pixels while the page size is
                // presumably in points — confirm that this comparison is intended.
                if (img.Width > ps.PageWidth || img.Height > ps.PageHeight)
                {
                    System.Drawing.Imaging.ImageFormat current = System.Drawing.Imaging.ImageFormat.Jpeg;
                    // Ordinal, case-insensitive test: independent of the current culture.
                    if (args.OriginalUri != null && args.OriginalUri.EndsWith(".PNG", StringComparison.OrdinalIgnoreCase))
                    {
                        current = System.Drawing.Imaging.ImageFormat.Png;
                    }

                    using (Image resized = ScaleImage(img, ps.PageWidth, ps.PageHeight))
                    using (MemoryStream msresized = new MemoryStream())
                    {
                        resized.Save(msresized, current);
                        args.SetData(msresized.ToArray());
                    }
                }
                else
                {
                    args.SetData(ms.ToArray());
                }
            }

            return ResourceLoadingAction.UserProvided;
        }

        /// <summary>
        /// Looks up the stream of the requested image in three passes: exact key,
        /// then file name only, then key prefixed with "./". Within a pass the last
        /// matching entry wins.
        /// </summary>
        private MemoryStream FindImageStream(ResourceLoadingArgs args)
        {
            MemoryStream found = null;

            // Find the correct image in dico
            foreach (KeyValuePair<String, MemoryStream> entry in otherFiles)
            {
                if (MatchesUri(entry.Key, args))
                {
                    found = entry.Value;
                }
            }
            if (found != null)
            {
                return found;
            }

            // If image not found, search in each folder
            foreach (KeyValuePair<String, MemoryStream> entry in otherFiles)
            {
                if (MatchesUri(Path.GetFileName(entry.Key), args))
                {
                    found = entry.Value;
                }
            }
            if (found != null)
            {
                return found;
            }

            // If image not found, and begin by ./
            foreach (KeyValuePair<String, MemoryStream> entry in otherFiles)
            {
                if (MatchesUri("./" + entry.Key, args))
                {
                    found = entry.Value;
                }
            }
            return found;
        }

        /// <summary>
        /// Tells whether the candidate equals the original or the resolved URI of the resource.
        /// </summary>
        private static bool MatchesUri(String candidate, ResourceLoadingArgs args)
        {
            return candidate.Equals(args.OriginalUri) || candidate.Equals(args.Uri);
        }

        /// <summary>
        /// Scales an image so that it fits within the given bounds while keeping
        /// its aspect ratio.
        /// </summary>
        /// <param name="image">Source image; it is not modified or disposed.</param>
        /// <param name="maxWidth">Maximum width of the result.</param>
        /// <param name="maxHeight">Maximum height of the result.</param>
        /// <returns>A new image that the caller must dispose.</returns>
        public static Image ScaleImage(Image image, double maxWidth, double maxHeight)
        {
            var ratioX = maxWidth / image.Width;
            var ratioY = maxHeight / image.Height;
            var ratio = Math.Min(ratioX, ratioY);

            // At least one pixel per side: a zero-sized bitmap cannot be created.
            var newWidth = Math.Max(1, (int)(image.Width * ratio));
            var newHeight = Math.Max(1, (int)(image.Height * ratio));

            var newImage = new Bitmap(newWidth, newHeight);
            // Graphics wraps a native handle and must be disposed.
            using (Graphics graphics = Graphics.FromImage(newImage))
            {
                graphics.DrawImage(image, 0, 0, newWidth, newHeight);
            }

            return newImage;
        }
    }
}

Regards

@CSCT,

We are working over your query and will get back to you soon.

@CSCT,

Thanks for your patience. We have tested the scenario and have managed to reproduce the same issue at our side. For the sake of correction, we have logged this problem in our issue tracking system as WORDSNET-16568. You will be notified via this forum thread once this issue is resolved.

We apologize for your inconvenience.

The issues you have found earlier (filed as WORDSNET-16568) have been fixed in this Aspose.Words for .NET 23.2 update also available on NuGet.