关于 Aspose.Words for Java:doc 文档内嵌有 docx 附件,该 docx 附件解析后乱码
测试2.doc.zip (17.5 KB)
主代码:
// Create the Word read source from the uploaded file
com.odm.word.reader.resource.Resource resource = new FileResource(file);
// Display-order generator
AtomicInteger displayIndex = new AtomicInteger(0);
// Globally cached relation between outline uuid and outline id
OdmWordDocOutline parentOutLine = new OdmWordDocOutline();
// Starting chapter level
Integer startLevel = null;
// Obtain the document reader handle
ThreadsWordReader wordReader = this.getThreadsWordReader(doc);
// Set the parameter controlling how many outlines go into one split task
wordReader.setParseParam(ParamKeys.SPLIT_WORD_OUTLINE_COUNT,
odmWordDocService.getBySplitWordParamByCount(odmProperties.getSplitWordOutlineCode()));
// Set the parameter controlling the maximum number of split threads
wordReader.setParseParam(ParamKeys.SPLIT_WORD_MAX_THREAD_COUNT,
odmWordDocService.getBySplitWordParamByCount(odmProperties.getSplitWordMaxThreadCountCode()));
// Bind the cover, outline and outline-content handlers to the reader
wordReaderWithbindDataCollectByThreads(wordReader, doc, displayIndex, parentOutLine, startLevel, false);
// Bind the parse-progress message push to the reader
wordReaderWithPushProcessMsg(wordReader, client, null);
// Set the read source
wordReader.setResource(resource);
// Start reading
wordReader.reader();
// Mark the document as extracted
odmWordDocService.updateExtractStatus(docId, ExtractStatusEnum.EXTRACSTATUS_YES.getValue());
分代码一:
/**
 * Creates the multi-threaded Word reader for the given document.
 *
 * <p>The upload directory resolved for {@code doc} is looked up once for each constructor
 * argument, so the same location is used as the image save directory and as the OLE save
 * directory.
 *
 * @param doc document object
 * @return a new {@link ThreadsFullWordReader}
 * @author huangyong
 * @date 2020/12/24 15:54
 * @since 8.3.0
 */
private ThreadsWordReader getThreadsWordReader(OdmWordDoc doc) {
    String imageSaveDir = odmWordTagService.getUploadDocDir(doc);
    String oleSaveDir = odmWordTagService.getUploadDocDir(doc);
    return new ThreadsFullWordReader(imageSaveDir, oleSaveDir);
}
分代码二:
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//
package com.odm.word.reader;
import com.aspose.words.Bookmark;
import com.aspose.words.BookmarkCollection;
import com.aspose.words.BookmarkEnd;
import com.aspose.words.Document;
import com.aspose.words.Field;
import com.aspose.words.FieldCollection;
import com.aspose.words.FieldHyperlink;
import com.aspose.words.Node;
import com.aspose.words.NodeCollection;
import com.aspose.words.Paragraph;
import com.aspose.words.StructuredDocumentTag;
import com.odm.word.WordRead;
import com.odm.word.exception.WordReaderException;
import com.odm.word.model.DocOutLine;
import com.odm.word.model.DocOutLineContent;
import com.odm.word.model.DocTag;
import com.odm.word.model.DocumentWrap;
import com.odm.word.reader.data.CollectType;
import com.odm.word.reader.data.DataCollect;
import com.odm.word.reader.data.TagUuidOnceDataCollect;
import com.odm.word.reader.listener.WordThreadParsedCompleteListener;
import com.odm.word.utils.ObjectUtil;
import com.odm.word.utils.StringUtil;
import com.odm.word.utils.TimerUtil;
import com.odm.word.utils.UUIDUtil;
import java.math.BigDecimal;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.Callable;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
public class ThreadsFullWordReader extends AbstractFullWordReader {
// Per-thread slot that SplitWordTaskByOutline.call() fills with the outline it has just collected.
private ThreadLocal<DocOutLine> runBindBeforeOutline = new ThreadLocal();
// Count of outlines finished so far across all split tasks; used to compute the progress percentage.
private AtomicInteger processedOutlineCount = new AtomicInteger(0);
// Maximum number of outlines handled by one split task. NOTE(review): static and overwritten by
// initParams(), so the value is shared by every reader instance.
private static int MAX_SPLIT_OUTLINE_COUNT = 25;
// Upper bound applied by getThreads() to the worker pool size. NOTE(review): static and overwritten by
// initParams(), so the value is shared by every reader instance.
private static int MAX_THREAD_COUNT = 4;
// Last non-null paragraph visited by convertByOutlineParagraph().
private Paragraph docLastParagraph;
// Number of paragraphs recorded by doReader().
private int paragraphsCount;
/**
 * Creates a reader and stores the two output directories through the inherited setters.
 *
 * @param imageSaveDir value passed to {@code setImageSaveDir}
 * @param oleSaveDir value passed to {@code setOleSaveDir}
 */
public ThreadsFullWordReader(String imageSaveDir, String oleSaveDir) {
this.setImageSaveDir(imageSaveDir);
this.setOleSaveDir(oleSaveDir);
}
/**
 * Parses the whole document: collects the outline paragraphs, splits the work into one cover
 * task plus one task per slice of {@code MAX_SPLIT_OUTLINE_COUNT} outlines, runs the tasks on
 * a fixed thread pool and finally notifies the registered parse-complete listeners.
 *
 * @param documentWrap wrapper around the document being parsed
 * @param wordRead helper used to update list labels and read tag data
 * @throws WordReaderException wrapping any failure raised while parsing; if the calling thread
 *     is interrupted its interrupt flag is restored before the exception is thrown
 */
public void doReader(DocumentWrap documentWrap, WordRead wordRead) throws WordReaderException {
    Long startTime = System.currentTimeMillis();
    this.runLog("word解析开始---->");
    ExecutorService executorService = null;
    try {
        this.initParams(documentWrap);
        wordRead.updateList(documentWrap);
        Set<String> linkSet = this.getFieldHyperlink(documentWrap.getDocument());
        // 8 is presumably NodeType.PARAGRAPH inlined by the decompiler — confirm against the Aspose version in use.
        NodeCollection<Paragraph> paragraphList = documentWrap.getDocument().getChildNodes(8, true);
        this.setParagraphsCount(paragraphList.getCount());
        List<DocOutLine> outlineParagraphs = this.convertByOutlineParagraph(paragraphList, linkSet);
        this.setMapTag(wordRead.readWKDataInXmlPart(documentWrap));
        DataCollect dataCollect = this.detectSupportCollectData(CollectType.TAG_UUID);
        if (dataCollect instanceof TagUuidOnceDataCollect) {
            dataCollect.collect(this.getMapTags(documentWrap), new Object[0]);
        }

        int recordOutlineCount = outlineParagraphs.size();
        List<Callable<Integer>> splitWordTasks = new ArrayList<>();
        // The cover task is always scheduled; outline tasks are added one per slice. With zero
        // outlines the loop adds nothing, with fewer than one slice it adds a single task at 0.
        splitWordTasks.add(new SplitWordTaskByCover(documentWrap, paragraphList));
        int fullSlices = recordOutlineCount / MAX_SPLIT_OUTLINE_COUNT;
        for (int i = 0; i <= fullSlices; ++i) {
            int startIndex = i * MAX_SPLIT_OUTLINE_COUNT;
            if (startIndex < recordOutlineCount) {
                splitWordTasks.add(new SplitWordTaskByOutline(documentWrap, outlineParagraphs, startIndex));
            }
        }

        executorService = Executors.newFixedThreadPool(this.getThreads(splitWordTasks));
        List<Future<Integer>> results = executorService.invokeAll(splitWordTasks);
        // invokeAll has already waited for completion; get() only surfaces task failures.
        for (Future<Integer> result : results) {
            result.get();
        }

        for (WordThreadParsedCompleteListener listener : this.getWordThreadParsedCompleteListeners()) {
            listener.onParsed(documentWrap, wordRead);
        }
        if (this.getProcessPercentRule().isRunProcess(100)) {
            this.runProcessPercent(100);
        }
    } catch (InterruptedException interrupted) {
        // Restore the flag so callers further up can still observe the interruption.
        Thread.currentThread().interrupt();
        throw new WordReaderException(interrupted);
    } catch (Exception failure) {
        throw new WordReaderException(failure);
    } finally {
        // Shut the pool down on every path so worker threads are not leaked when parsing fails.
        if (executorService != null) {
            executorService.shutdown();
        }
        this.runLog("word解析结束 耗时=" + TimerUtil.getCostTime(startTime));
    }
}
/**
 * Reads the optional split parameters from the document wrapper and, when they hold a usable
 * value, overrides {@code MAX_SPLIT_OUTLINE_COUNT} and {@code MAX_THREAD_COUNT}.
 *
 * <p>A value is ignored (the current setting is kept) when it is not numeric, does not fit in
 * an {@code int}, is not positive, or — for the thread count — is not below the number of
 * available processors.
 *
 * @param documentWrap wrapper carrying the parse parameters
 */
private void initParams(DocumentWrap documentWrap) {
    String splitWordOutlineCount = (String) documentWrap.getParseParam("odm_split_word_outline_count");
    if (StringUtil.isNumeric(splitWordOutlineCount)) {
        try {
            int splitOutlineCount = Integer.parseInt(splitWordOutlineCount);
            if (splitOutlineCount > 0 && splitOutlineCount < Integer.MAX_VALUE) {
                MAX_SPLIT_OUTLINE_COUNT = splitOutlineCount;
            }
        } catch (NumberFormatException ignored) {
            // Digit string too large for an int: treat like any other unusable value and keep the current setting.
        }
    }

    String odmSplitWordMaxThreadCount = (String) documentWrap.getParseParam("odm_split_word_max_thread_count");
    if (StringUtil.isNumeric(odmSplitWordMaxThreadCount)) {
        try {
            int maxThreadCount = Integer.parseInt(odmSplitWordMaxThreadCount);
            if (maxThreadCount > 0 && maxThreadCount < Runtime.getRuntime().availableProcessors()) {
                MAX_THREAD_COUNT = maxThreadCount;
            }
        } catch (NumberFormatException ignored) {
            // Digit string too large for an int: keep the current setting.
        }
    }
}
/** Returns the paragraph stored by {@link #setDocLastParagraph(Paragraph)}. */
public Paragraph getDocLastParagraph() {
return this.docLastParagraph;
}
/** Stores the paragraph later used as the end marker for the final outline's content. */
public void setDocLastParagraph(Paragraph docLastParagraph) {
this.docLastParagraph = docLastParagraph;
}
/** Returns the paragraph count stored by {@link #setParagraphsCount(int)}. */
public int getParagraphsCount() {
return this.paragraphsCount;
}
/** Stores the number of paragraphs found in the document. */
public void setParagraphsCount(int paragraphsCount) {
this.paragraphsCount = paragraphsCount;
}
/**
 * Scans the paragraphs in order and builds a {@link DocOutLine} for every paragraph that
 * {@code WordRead.isOutLine} accepts, linking each outline to its parent via
 * {@code getParentDocLine}. As a side effect the last non-null paragraph visited is stored
 * through {@link #setDocLastParagraph(Paragraph)}.
 *
 * <p>The result is an {@code ArrayList} because the split tasks read it by index; a linked
 * list would make every such lookup linear.
 *
 * @param paragraphList paragraphs of the document, in document order
 * @param linkSet bookmark names that are targets of hyperlinks
 * @return the outlines in document order; empty when no paragraph is an outline
 */
private List<DocOutLine> convertByOutlineParagraph(NodeCollection<Paragraph> paragraphList, Set<String> linkSet) {
    List<DocOutLine> outlineParagraphs = new ArrayList<>();
    DocOutLine beforeDocOutLine = null;
    int dispIndx = 1;
    for (int i = 0, len = paragraphList.getCount(); i < len; ++i) {
        Paragraph paragraph = (Paragraph) paragraphList.get(i);
        if (paragraph == null) {
            // Nothing more to read; the last paragraph recorded so far stays in place.
            break;
        }
        if (WordRead.isOutLine(paragraph)) {
            this.removeComment(paragraph);
            DocOutLine docOutLine = new DocOutLine();
            docOutLine.setLinkBookmark(this.getBookmark(paragraph, linkSet))
                    .setOutLevel(this.getOutLevel(paragraph))
                    .setDispIndx(dispIndx)
                    .setChapterNo(paragraph.getListLabel().getLabelString())
                    .setText(StringUtil.formatWord(paragraph.getText()))
                    .setStyleName(this.getWordRead().filterHanderTooLongStyleName(paragraph))
                    .setParagraph(paragraph);
            if (ObjectUtil.isNotNull(beforeDocOutLine)) {
                docOutLine.setParent(this.getParentDocLine(docOutLine, beforeDocOutLine));
            }
            outlineParagraphs.add(docOutLine);
            beforeDocOutLine = docOutLine;
            ++dispIndx;
        }
        this.setDocLastParagraph(paragraph);
    }
    return outlineParagraphs;
}
/**
 * Chooses the worker pool size for the given tasks: half the task count rounded up, limited
 * to {@code MAX_THREAD_COUNT} and never less than one (a fixed thread pool rejects a size of
 * zero).
 *
 * @param splitWordTasks tasks that will be submitted to the pool
 * @return pool size in the range {@code [1, MAX_THREAD_COUNT]}
 */
private int getThreads(List<Callable<Integer>> splitWordTasks) {
    int taskCount = splitWordTasks.size();
    // Integer form of ceil(taskCount / 2.0); a single task yields 1.
    int threads = (taskCount + 1) / 2;
    if (threads > MAX_THREAD_COUNT) {
        threads = MAX_THREAD_COUNT;
    }
    return Math.max(1, threads);
}
/**
 * Collects the hyperlink-target bookmark names that belong to a paragraph as a
 * comma-separated string.
 *
 * <p>When the paragraph's range has bookmarks, the ones contained in {@code linkSet} are
 * joined (later bookmarks first). Otherwise the names are taken from the bookmark-end nodes
 * directly preceding the paragraph. The trailing comma is stripped from a non-empty result.
 *
 * @param paragraph paragraph to inspect
 * @param linkSet bookmark names that are targets of hyperlinks
 * @return comma-separated bookmark names, or an empty string when none match
 */
protected String getBookmark(Paragraph paragraph, Set linkSet) {
    BookmarkCollection rangeBookmarks = paragraph.getRange().getBookmarks();
    String joined = "";
    if (rangeBookmarks.getCount() > 0) {
        for (Bookmark candidate : rangeBookmarks) {
            if (linkSet.contains(candidate.getName())) {
                joined = candidate.getName() + "," + joined;
            }
        }
    } else {
        joined = this.bookmark(paragraph, joined, linkSet);
    }
    if (!StringUtil.isNotEmpty(joined)) {
        return joined;
    }
    // Both branches leave a trailing comma behind; drop it.
    return joined.substring(0, joined.length() - 1);
}
/**
 * Returns the sub-addresses of all hyperlink fields in the document, skipping hyperlinks
 * without a sub-address and those whose sub-address contains {@code "_Toc"}.
 *
 * <p>The hyperlink address is no longer printed to standard output; that was leftover debug
 * output.
 *
 * @param document document whose fields are scanned
 * @return set of sub-addresses; empty when the document has no matching hyperlink
 */
protected Set<String> getFieldHyperlink(Document document) {
    Set<String> linkSet = new HashSet<>();
    for (Field field : document.getRange().getFields()) {
        if (!(field instanceof FieldHyperlink)) {
            continue;
        }
        String subAddress = ((FieldHyperlink) field).getSubAddress();
        if (subAddress != null && !subAddress.contains("_Toc")) {
            linkSet.add(subAddress);
        }
    }
    return linkSet;
}
/**
 * Walks backwards over the run of sibling nodes of type 10 that directly precede
 * {@code node} and appends each bookmark name found in {@code linkSet}, followed by a comma,
 * to {@code bookmark}. The walk stops at the first sibling of another type or when there is
 * no previous sibling.
 *
 * @param node node whose preceding siblings are inspected
 * @param bookmark text accumulated so far
 * @param linkSet bookmark names that are targets of hyperlinks
 * @return the accumulated text, unchanged when nothing matched
 */
private String bookmark(Node node, String bookmark, Set linkSet) {
    String collected = bookmark;
    // Type 10 is presumably NodeType.BOOKMARK_END (the node is cast to BookmarkEnd below) — confirm.
    for (Node sibling = node.getPreviousSibling();
            sibling != null && sibling.getNodeType() == 10;
            sibling = sibling.getPreviousSibling()) {
        String endName = ((BookmarkEnd) sibling).getName();
        if (StringUtil.isNotEmpty(endName) && linkSet.contains(endName)) {
            collected = collected + endName + ",";
        }
    }
    return collected;
}
/**
 * Task that processes one slice of the outline list: the entries from {@code startIndex} up
 * to {@code startIndex + MAX_SPLIT_OUTLINE_COUNT} (capped at the list size). For each outline
 * it collects the outline, extracts the content up to the next outline (or up to the last
 * document paragraph for the final outline) and reports progress.
 */
class SplitWordTaskByOutline implements Callable<Integer> {
    private DocumentWrap documentWrap;
    private List<DocOutLine> outlineParagraphs;
    private int startIndex;

    public SplitWordTaskByOutline(DocumentWrap documentWrap, List<DocOutLine> outlineParagraphs, int startIndex) {
        this.documentWrap = documentWrap;
        this.outlineParagraphs = outlineParagraphs;
        this.startIndex = startIndex;
    }

    /**
     * Runs the slice.
     *
     * @return always {@code 1}
     */
    public Integer call() {
        final ThreadsFullWordReader reader = ThreadsFullWordReader.this;
        final int totalOutlines = this.outlineParagraphs.size();
        final int endExclusive = Math.min(this.startIndex + ThreadsFullWordReader.MAX_SPLIT_OUTLINE_COUNT, totalOutlines);

        for (int index = this.startIndex; index < endExclusive; index++) {
            DocOutLine current = this.outlineParagraphs.get(index);

            // Attach the tag found on the outline paragraph, if there is one.
            StructuredDocumentTag tagBox = WordRead.readTagBoxByParagraphOutline(current.getParagraph());
            if (ObjectUtil.isNotNull(tagBox)) {
                current.setDocTag(reader.getMapTag().get(tagBox.getTag()));
                current.setUuid(tagBox.getTag());
            }

            reader.detectSupportCollectData(CollectType.OUTLINE).collect(current, new Object[0]);
            reader.runBindBeforeOutline.set(current);
            reader.runLog("[" + current.getChapterNo() + " " + current.getText() + "] 大纲解析成功---->");

            DocOutLineContent content = new DocOutLineContent();
            content.setDocOutLine(current);
            content.setUuid(UUIDUtil.uuid());

            int followingIndex = index + 1;
            if (followingIndex < totalOutlines) {
                // Content runs up to the next outline paragraph.
                Paragraph nextOutlineParagraph = this.outlineParagraphs.get(followingIndex).getParagraph();
                reader.handleOutLineContent(this.documentWrap, content, nextOutlineParagraph, false);
            } else if (current.getParagraph() != reader.getDocLastParagraph()) {
                // Final outline: content runs to the end of the document, unless the outline
                // itself is the last paragraph.
                reader.handleOutLineContent(this.documentWrap, content, reader.getDocLastParagraph(), true);
            }

            int finished = reader.processedOutlineCount.incrementAndGet();
            double ratio = (double) ((float) finished / (float) totalOutlines);
            int percent = (int) (new BigDecimal(ratio).setScale(2, BigDecimal.ROUND_DOWN).doubleValue() * 100.0D);
            // 100 percent is reported by the caller once every task and listener has finished.
            if (percent != 100 && reader.getProcessPercentRule().isRunProcess(percent)) {
                reader.runProcessPercent(percent);
            }
        }
        return 1;
    }
}
/**
 * Task that handles the cover part of the document by delegating to
 * {@code handlerCoverDataReturnNextNodePosition} of the enclosing reader.
 */
class SplitWordTaskByCover implements Callable<Integer> {
    private final DocumentWrap documentWrap;
    private final NodeCollection<Paragraph> paragraphList;

    public SplitWordTaskByCover(DocumentWrap documentWrap, NodeCollection<Paragraph> paragraphList) {
        this.documentWrap = documentWrap;
        this.paragraphList = paragraphList;
    }

    /**
     * Processes the cover; the position returned by the handler is not used.
     *
     * @return always {@code 1}
     */
    @Override
    public Integer call() {
        ThreadsFullWordReader outer = ThreadsFullWordReader.this;
        outer.handlerCoverDataReturnNextNodePosition(this.documentWrap, this.paragraphList);
        return 1;
    }
}
}
分代码三:
//
// Source code recreated from a .class file by IntelliJ IDEA
// (powered by FernFlower decompiler)
//
package com.odm.word.reader;
import com.aspose.words.*;
import com.odm.word.HtmlSaveOptionsBuilder;
import com.odm.word.WordContentExtractor;
import com.odm.word.WordHtmlWebEnhance;
import com.odm.word.WordRead;
import com.odm.word.exception.WordReaderException;
import com.odm.word.model.*;
import com.odm.word.reader.data.CollectType;
import com.odm.word.reader.process.ProcessPercentRule;
import com.odm.word.utils.ListUtil;
import com.odm.word.utils.ObjectUtil;
import com.odm.word.utils.StringUtil;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.nodes.Document;
import java.math.BigDecimal;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
public abstract class AbstractFullWordReader extends AbstractSupportOleWordReader {
// Tag lookup keyed by tag string; read via getMapTag() when resolving the DocTag of a cover or outline.
private Map<String, DocTag> mapTag;
// Rule consulted through isRunProcess(percent) before a progress percentage is reported.
private ProcessPercentRule processPercentRule = new ProcessPercentRule();
/** Creates the reader; no state beyond the field initializers is set up here. */
public AbstractFullWordReader() {
}
/**
 * Extracts the content that belongs to one outline — from the outline's own paragraph up to
 * {@code endParagraph} — into a separate document, cleans up its TOC and hyperlink fields,
 * converts it to HTML and hands the result to the {@code OUTLINE_CONTENT} data collector.
 * Does nothing when {@code docOutLineContent} is null.
 *
 * @param documentWrap wrapper around the source document
 * @param docOutLineContent content holder; its outline supplies the start paragraph
 * @param endParagraph paragraph marking the end of the content
 * @param docEnd passed through to {@code WordContentExtractor.extractContent}
 * @throws WordReaderException wrapping any failure, after it has been logged
 */
protected void handleOutLineContent(DocumentWrap documentWrap, DocOutLineContent docOutLineContent, Paragraph endParagraph, boolean docEnd) {
    try {
        if (ObjectUtil.isNull(docOutLineContent)) {
            return;
        }

        Paragraph startParagraph = docOutLineContent.getDocOutLine().getParagraph();
        int startOrientation = startParagraph.getParentSection().getPageSetup().getOrientation();
        int endOrientation = endParagraph.getParentSection().getPageSetup().getOrientation();
        int pageOrientation = Math.max(startOrientation, endOrientation);

        ArrayList extractedNodes = WordContentExtractor.extractContent(startParagraph, endParagraph, false, docEnd);
        DocumentWrap dstDoc = WordContentExtractor.generateDocument(documentWrap, extractedNodes);

        BuiltInDocumentProperties properties = dstDoc.getDocument().getBuiltInDocumentProperties();
        dstDoc.getDocument().updateWordCount();
        docOutLineContent.setStatWordsCount(properties.getWords());
        docOutLineContent.setComments(this.extractComments(dstDoc));

        for (Field field : dstDoc.getDocument().getRange().getFields()) {
            if (field instanceof FieldToc) {
                // Drop the paragraph that starts a table-of-contents field.
                ((FieldToc) field).getStart().getParentParagraph().remove();
                continue;
            }
            if (!(field instanceof FieldHyperlink)) {
                continue;
            }
            FieldHyperlink hyperlink = (FieldHyperlink) field;
            String subAddress = hyperlink.getSubAddress();
            if (subAddress == null) {
                hyperlink.unlink();
            } else if (subAddress.indexOf("_Toc") > -1) {
                hyperlink.remove();
            } else {
                docOutLineContent.setLinkName(subAddress);
            }
        }

        List<DocReadFile> docReadFiles = new ArrayList<>();
        HtmlSaveOptions htmlSaveOptions = HtmlSaveOptionsBuilder.build(documentWrap, this.getImageSaveDir(), this.getOleSaveDir(), docReadFiles);
        String rawHtml = dstDoc.getDocument().toString(htmlSaveOptions);
        Document doc = WordHtmlWebEnhance.enhanceWeb(rawHtml, docReadFiles);
        docOutLineContent.setHtml(doc.body().html());
        docOutLineContent.setPageOrientation(pageOrientation);

        this.detectSupportCollectData(CollectType.OUTLINE_CONTENT).collect(docOutLineContent, new Object[]{docReadFiles});
        this.runLog("大纲[" + docOutLineContent.getDocOutLine().getChapterNo() + " " + docOutLineContent.getDocOutLine().getText() + "] 内容解析成功---->");
    } catch (Exception failure) {
        this.runErrorLog("大纲内容处理异常", failure);
        throw new WordReaderException("大纲内容处理异常", failure);
    }
}
/**
 * Extracts the cover — the body paragraphs that precede the first non-empty outline — into a
 * separate document, strips table-of-contents entries and the TOC heading, converts the result
 * to HTML and hands it to the {@code COVER} data collector.
 *
 * <p>TOC paragraphs are removed while walking a snapshot ({@code toArray()}) of the paragraph
 * collection. The collection returned by {@code getChildNodes} is live, so removing nodes
 * while iterating it directly can skip paragraphs.
 *
 * @param documentWrap wrapper around the source document
 * @param paragraphs all paragraphs of the source document, in document order
 * @return the last paragraph position recorded while scanning for the first outline
 * @throws WordReaderException wrapping any failure, after it has been logged
 */
protected int handlerCoverDataReturnNextNodePosition(DocumentWrap documentWrap, NodeCollection<Paragraph> paragraphs) throws WordReaderException {
    try {
        ParagraphStoreList paragraphStoreList = this.readFirstOutlineBeforeParagraphsForDocument(paragraphs);
        if (ListUtil.isNullOrEmpty(paragraphStoreList.getParagraphList())) {
            return paragraphStoreList.getLastParaPostion();
        }

        int coverSumCount = paragraphStoreList.getParagraphList().size();
        Paragraph startNode = (Paragraph) paragraphStoreList.getParagraphList().get(0);
        Paragraph endNode = (Paragraph) paragraphStoreList.getParagraphList().get(coverSumCount - 1);
        DocCover docCover = new DocCover();
        ArrayList nodeList = WordContentExtractor.extractContent(startNode, endNode, true, true);
        DocumentWrap dstDoc = WordContentExtractor.generateDocument(documentWrap, nodeList);

        // 8 is presumably NodeType.PARAGRAPH — confirm. Snapshot first: nodes are removed below.
        Node[] paragraphSnapshot = dstDoc.getDocument().getChildNodes(8, true).toArray();
        boolean removedTocName = false;
        Node tocNameNode = null;
        for (Node node : paragraphSnapshot) {
            Paragraph paragraph = (Paragraph) node;
            if (paragraph.getText().indexOf("_Toc") != -1) {
                if (!removedTocName) {
                    // Remember where the TOC begins so its heading can be located afterwards.
                    tocNameNode = paragraph.getPreviousSibling();
                    removedTocName = true;
                }
                paragraph.remove();
            }
        }

        // Look at up to five nodes before the TOC for the "目录" heading and remove it.
        int times = 5;
        do {
            if (tocNameNode instanceof Paragraph && !((Paragraph) tocNameNode).isInCell()) {
                String nodeText = StringUtils.deleteWhitespace(tocNameNode.getText());
                if ("目录".equals(nodeText)) {
                    tocNameNode.remove();
                    break;
                }
                Node childToc = ((Paragraph) tocNameNode).getFirstChild();
                if (childToc != null) {
                    String nodeChildText = StringUtils.deleteWhitespace(childToc.getText());
                    if ("目录".equals(nodeChildText)) {
                        tocNameNode.remove();
                        break;
                    }
                }
                tocNameNode = tocNameNode.getPreviousSibling();
            }
            --times;
        } while (times > 0);

        com.aspose.words.Document document = dstDoc.getDocument();
        docCover.setComments(this.extractComments(document));
        // 28 is presumably NodeType.STRUCTURED_DOCUMENT_TAG — confirm.
        NodeCollection<StructuredDocumentTag> documentTags = document.getChildNodes(28, true);
        if (ObjectUtil.isNotNull(documentTags) && documentTags.getCount() > 0) {
            docCover.setUuid(((StructuredDocumentTag) documentTags.get(0)).getTag());
            docCover.setDocTag(this.getMapTag().get(docCover.getUuid()));
        }

        BuiltInDocumentProperties properties = dstDoc.getDocument().getBuiltInDocumentProperties();
        dstDoc.getDocument().updateWordCount();
        docCover.setStatWordsCount(properties.getWords());

        List<DocReadFile> docReadFiles = new ArrayList<>();
        HtmlSaveOptions htmlSaveOptions = HtmlSaveOptionsBuilder.build(documentWrap, this.getImageSaveDir(), this.getOleSaveDir(), docReadFiles);
        Document doc = WordHtmlWebEnhance.enhanceWeb(dstDoc.getDocument().toString(htmlSaveOptions), docReadFiles);
        docCover.setHtml(doc.body().html());
        this.detectSupportCollectData(CollectType.COVER).collect(docCover, new Object[]{docReadFiles});
        this.runLog("封面解析成功---->");
        return paragraphStoreList.getLastParaPostion();
    } catch (Exception failure) {
        this.runErrorLog("解析文档封面失败", failure);
        throw new WordReaderException("解析文档封面失败", failure);
    }
}
/**
 * Gathers the paragraphs that come before the first outline paragraph with non-empty text.
 * Only paragraphs that have an ancestor of node type 3 are stored. The position of the
 * paragraph being examined is recorded on the result, and progress is reported while
 * scanning.
 *
 * @param paragraphs all paragraphs of the document, in document order
 * @return the stored paragraphs together with the last examined position
 */
private ParagraphStoreList readFirstOutlineBeforeParagraphsForDocument(NodeCollection<Paragraph> paragraphs) {
    ParagraphStoreList coverParagraphs = new ParagraphStoreList(0);
    final int total = paragraphs.getCount();
    for (int position = 0; position < total; position++) {
        coverParagraphs.setLastParaPostion(position);
        Paragraph candidate = (Paragraph) paragraphs.get(position);

        boolean reachedFirstOutline = WordRead.isOutLine(candidate)
                && StringUtil.isNotEmpty(StringUtil.formatWord(candidate.getText()));
        if (reachedFirstOutline) {
            break;
        }

        // 3 is presumably NodeType.BODY, i.e. keep only paragraphs inside the main body — confirm.
        if (ObjectUtil.isNotNull(candidate.getAncestor(3))) {
            coverParagraphs.addParagraph(candidate);
        }

        double ratio = (double) ((float) (position + 1) / (float) total);
        int percent = (int) (new BigDecimal(ratio).setScale(2, BigDecimal.ROUND_DOWN).doubleValue() * 100.0D);
        if (this.getProcessPercentRule().isRunProcess(percent)) {
            this.runProcessPercent(percent);
        }
    }
    return coverParagraphs;
}
/** Returns the tag map stored by {@link #setMapTag(Map)}; null until it has been set. */
public Map<String, DocTag> getMapTag() {
return this.mapTag;
}
/** Stores the tag map used to look up the {@code DocTag} of a cover or outline. */
public void setMapTag(Map<String, DocTag> mapTag) {
this.mapTag = mapTag;
}
/** Returns the rule that decides whether a given progress percentage is reported. */
public ProcessPercentRule getProcessPercentRule() {
return this.processPercentRule;
}
}
原doc文档带了一个docx的附件:
image.png (97.0 KB)
aspose解析后,附件乱码:
image.png (223.8 KB)
c722eca4a08349918ca7291fcd6072d6.docx (12.5 KB)
跟踪代码主要是这段代码调用aspose组件解析时,会把原doc文档中的附件解析出来,但解析出来的结果是乱码:
// Excerpt from handleOutLineContent / handlerCoverDataReturnNextNodePosition: the extracted
// document is rendered to HTML with the save options built from the image and OLE directories.
// NOTE(review): presumably the embedded attachments are written out during this call — confirm in HtmlSaveOptionsBuilder.
List<DocReadFile> docReadFiles = new ArrayList();
HtmlSaveOptions htmlSaveOptions = HtmlSaveOptionsBuilder.build(documentWrap, this.getImageSaveDir(), this.getOleSaveDir(), docReadFiles);
Document doc = WordHtmlWebEnhance.enhanceWeb(dstDoc.getDocument().toString(htmlSaveOptions), docReadFiles);
原doc文档和原docx附件:
归档.zip (26.8 KB)
@vpsoft, 您似乎发布了不相关的代码。 为了保存嵌入的 Word 文档,您很可能使用与此类似的代码:https://docs.aspose.com/words/java/working-with-ole-objects/#get-access-to-ole-object-raw。
您能找到并发布保存嵌入式 DOCX 文件的确切代码片段吗?
因为aspose的包是混淆过,跟踪到的方法名都是不可读
package com.odm.examples;
import com.odm.word.constants.ParamKeys;
import com.odm.word.model.DocCover;
import com.odm.word.model.DocOutLine;
import com.odm.word.model.DocOutLineContent;
import com.odm.word.model.DocReadFile;
import com.odm.word.reader.DefaultFullWordReader;
import com.odm.word.reader.ThreadsFullWordReader;
import com.odm.word.reader.WordReader;
import com.odm.word.reader.data.*;
import com.odm.word.reader.log.ConsoleWordLogListener;
import com.odm.word.reader.resource.InputStreamResource;
import com.odm.word.reader.resource.Resource;
import com.odm.word.utils.WordPropsUtil;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
 * Word read test.
 *
 * <pre class="code">
 *
 * // File stream resource
 * File file = new File("d:\\测试.docx");
 * Resource resource = new InputStreamResource(new FileInputStream(file));
 *
 * WordReader wordReader = new DefaultFullWordReader("d:\\upload", "d:\\upload");
 * // Bind the tag uuid data collector
 * wordReader.bindDataCollect(new TagUuidOnceDataCollect(){
 * <code>@Override</code>
 * public void doCollect(Map<String, Integer> tagMap) {
 * for (Map.Entry<String, Integer> entry : tagMap.entrySet()) {
 * System.out.println(entry.getKey()+"=="+entry.getValue());
 * }
 * }
 * });
 * // Bind the cover data collector
 * wordReader.bindDataCollect(new CoverDataCollect(){
 * <code>@Override</code>
 * public void doCollect(DocCover docCover, List<DocReadFile> docReadFiles) {
 * System.out.println(docCover.getHtml());
 * }
 * });
 * // Bind the outline data collector
 * wordReader.bindDataCollect(new OutlineDataCollect() {
 * <code>@Override</code>
 * public void doCollect(DocOutLine docOutLine) {
 * // Set the business ID on the outline
 * docOutLine.setBusId(new Random().nextInt());
 * System.out.println(docOutLine.toString());
 * }
 * });
 * // Bind the outline content data collector
 * wordReader.bindDataCollect(new OutlineContentDataCollect() {
 * <code>@Override</code>
 * public void doCollect(DocOutLineContent docOutLineContent, List<DocReadFile> docReadFiles) {
 * // The outline ID is also reachable from the content, to associate outline and content IDs
 * System.out.println(docOutLineContent.getHtml());
 * }
 * });
 *
 * // Bind log output
 * wordReader.bindLog(new ConsoleWordLog());
 * // Bind the read resource
 * wordReader.setResource(resource);
 * // Start reading
 * wordReader.reader();
 * </pre>
 *
 *
 * @author huangyong
 * @date 2018/5/11 20:31
 * @since v1.0.0
 */
public class WordReadDemo {
    /**
     * Parses the sample document and prints the collected document properties, tags, cover
     * HTML, outlines and outline content to standard output.
     *
     * @param args unused
     * @throws FileNotFoundException if the sample document does not exist
     */
    public static void main(String[] args) throws FileNotFoundException {
        File file = new File("/Users/alex/Documents/7047002009530007728/测试2.doc");
        WordPropsUtil.setProperty(ParamKeys.WORD_IMAGE_RESOLUTION, "100");
        // try-with-resources so the input stream is closed even when parsing fails.
        try (FileInputStream input = new FileInputStream(file)) {
            Resource resource = new InputStreamResource(input);
            DefaultFullWordReader wordReader = new DefaultFullWordReader("/Users/alex/Documents/7047002009530007728", "/Users/alex/Documents/7047002009530007728");
            wordReader.bindDocProp(new DocPropDataCollect() {
                @Override
                public void doCollect(Map<String, Object> docProps) {
                    System.out.println(docProps);
                }
            });
            wordReader.bindDataCollect(new TagUuidOnceDataCollect() {
                @Override
                public void doCollect(Map<String, Integer> tagMap) {
                    for (Map.Entry<String, Integer> entry : tagMap.entrySet()) {
                        System.out.println(entry.getKey() + "==" + entry.getValue());
                    }
                }
            });
            wordReader.bindDataCollect(new CoverDataCollect() {
                @Override
                public void doCollect(DocCover docCover, List<DocReadFile> docReadFiles) {
                    System.out.println(docCover.getHtml());
                }
            });
            wordReader.bindDataCollect(new OutlineDataCollect() {
                @Override
                public void doCollect(DocOutLine docOutLine) {
                    // Set the business ID on the outline
                    docOutLine.setBusId(new Random().nextInt());
                    System.out.println("linkbookNameoutline++++++++++++++++++++++++++" + docOutLine.getLinkBookmark());
                    System.out.println(docOutLine.toString());
                }
            });
            wordReader.bindDataCollect(new OutlineContentDataCollect() {
                @Override
                public void doCollect(DocOutLineContent docOutLineContent, List<DocReadFile> docReadFiles) {
                    // The outline ID is also reachable from the content, to associate outline and content IDs
                    String text = docOutLineContent.getHtml().replaceAll("> <", "><");
                    System.out.println("linkbookNameoutline++++++++++++++++++++++++++11111111111" + text);
                }
            });
            wordReader.addLogListener(new ConsoleWordLogListener());
            wordReader.setResource(resource);
            wordReader.reader();
        } catch (FileNotFoundException missing) {
            // Keep the declared contract: a missing sample file is reported as-is.
            throw missing;
        } catch (java.io.IOException closeFailure) {
            // Only closing the stream can raise this here; surface it without widening the signature.
            throw new java.io.UncheckedIOException("Failed to close " + file, closeFailure);
        }
    }
}
测试2.doc.zip (17.5 KB)
@vpsoft, 这是可以从 DOC 中提取 DOCX 的最小代码:
// Load the DOC file that contains the embedded DOCX attachment.
Document doc = new Document("测试2.doc");
// Take the first shape found anywhere in the document (index 0, deep search).
// NOTE(review): assumes that shape is the embedded OLE object — true for this sample document.
Shape shape = (Shape)doc.getChild(NodeType.SHAPE, 0, true);
// Write the shape's OLE data out to a file.
shape.getOleFormat().save("embedded.docx");
您可以通过搜索“oleFormat.save”字符串在您的源代码文件中找到它。
我能够重现这个问题。似乎只有当 DOC 文件是在 WPS Office 中创建时,提取出的 DOCX 才会出现问题。如果 DOC 文件是在 Microsoft Word 中创建的,则该问题无法重现:提取出的 DOCX 可以在 Microsoft Word 或 WPS Office 中正常打开,没有任何问题。
虽然 Aspose.Words 的目标是提供与 Microsoft Word 的最佳兼容性,但我已经为您的问题打开了 WORDSNET-25257 票证。 修复后,您将在此线程中收到通知。
收到,因为我们产品对此bug的处理要求比较急,请老师帮忙加急处理一下,万分感谢!
The issues you have found earlier (filed as WORDSNET-25257) have been fixed in this Aspose.Words for Java 23.7 update.
您好,经验证,WPS 中后缀为 doc 的文档中插入 .docx 附件解析后乱码的问题已解决。但是文档中含有 .xlsx 附件时,解析后依然乱码,能帮忙处理一下吗?
关于 WORDSNET-25257:您好,经验证,WPS 中后缀为 doc 的文档中插入 .docx 附件解析后乱码的问题已解决。但是文档中含有 .xlsx 附件时,解析后依然乱码,能帮忙处理一下吗?处理好后,我们将会购买服务。
您好:
附件为包含嵌入的 .xlsx 的示例文档,请查收!
(Attachment 需求一体化管理系统建设项目需求书.rar is missing)
The issues you have found earlier (filed as WORDSNET-25730) have been fixed in this Aspose.Words for Java 23.9 update.