PDF 中的文字使用了内嵌字体,导致使用 TextFragmentAbsorber 无法匹配到正确的文本,应如何处理?f94a6f84-ecb5-4254-8d6a-d04e0846a377.pdf (222.1 KB)
文档中的手机号和邮箱都无法用正常的正则匹配到。
/**
 * Regular expression used to locate phone numbers in extracted text.
 *
 * <p>It has two alternatives, followed by a negative lookahead that rejects a match
 * immediately followed by another digit:
 * <ul>
 *   <li>eleven digits starting with {@code 1}, whose second digit is 3-9, where any
 *       number of hyphens or whitespace characters may appear between consecutive digits
 *       (presumably a mobile number - confirm against the intended numbering plan);</li>
 *   <li>{@code 0}, a digit 1-9 and one or two more digits, then optional hyphens or
 *       whitespace, then seven or eight digits (presumably an area code plus landline
 *       number - confirm).</li>
 * </ul>
 *
 * <p>NOTE(review): there is no leading boundary check, so a match may begin in the
 * middle of a longer digit run.
 */
public static final String PHONE_REG = "(?:(?:1[-\\s]*[3456789][-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1}[-\\s]*\\d{1})|(?:0[1-9]\\d{1,2}[-\\s]*\\d{7,8}))(?!\\d)";
/**
 * Reads a PDF from a hard-coded path, searches its first page for text matching
 * {@code PHONE_REG}, and logs each matched fragment.
 *
 * <p>Failures raised while opening or searching the document are caught and their
 * stack trace is printed; they do not propagate to the caller.
 *
 * @param args command-line arguments; not used
 * @throws Exception if reading the file fails, or if {@code getLicense()} returns
 *     {@code false}
 */
public static void main(String[] args) throws Exception {
    byte[] source = FileUtils.readFileToByteArray(new File("/Users/Carol/Desktop/f94a6f84-ecb5-4254-8d6a-d04e0846a377.pdf"));
    if (!getLicense()) {
        throw new Exception("com.aspose.pdf lic ERROR!");
    }
    // The previously declared ByteArrayOutputStream was never written to or read, so it is gone.
    try (ByteArrayInputStream searchInputStream = new ByteArrayInputStream(source)) {
        Document pdfDoc = new Document(searchInputStream);
        try {
            // NOTE(review): 'true' presumably switches the absorber to regular-expression
            // matching - confirm against the TextSearchOptions API reference.
            TextSearchOptions textSearchOptions = new TextSearchOptions(true);
            TextEditOptions textEditOptions = new TextEditOptions(0, TextEditOptions.LanguageTransformation.class);
            TextFragmentAbsorber phoneTextFragmentAbsorber = new TextFragmentAbsorber(
                    PHONE_REG,
                    textSearchOptions,
                    textEditOptions);

            // Only the first page is searched (page collection lookup uses index 1).
            PageCollection pages = pdfDoc.getPages();
            Page page = pages.get_Item(1);

            // Announce the search before it actually runs; accept() is what performs it.
            logger.info("[开始搜索]");
            page.accept(phoneTextFragmentAbsorber);

            for (TextFragment textFragment : phoneTextFragmentAbsorber.getTextFragments()) {
                String text = textFragment.getText();
                logger.info("手机: " + text);
            }
        } finally {
            // Release the resources held by the document even when the search fails.
            pdfDoc.close();
        }
    } catch (Exception e) {
        // NOTE(review): kept as-is because the logger's type is not visible here; prefer an
        // error-level log call that records the throwable once that is confirmed.
        e.printStackTrace();
    }
}