Skip to content

Commit 61d1063

Browse files
committed
Fix issues in text selection
- PR mozilla#13257 fixed a lot of issues, but not all of them; this patch aims to fix almost all of the remaining ones.
- The idea in this new patch is to compare the position of each new glyph with the last position where a glyph was drawn.
- Spaces are never "drawn": a space just moves the cursor and is not added to the chunk.
- This way, a space followed by a cursor move can be treated as a single space, which helps to merge all consecutive spaces into one.
- To distinguish real spaces from tracking ones, we previously used a factor of the space width (taken from the font).
- That was a pretty good idea in general, but it fails with some fonts where the space is too big.
- Poppler uses a factor of the font size instead, which is an excellent idea (a move <= 0.1 * fontSize implies a tracking space).
1 parent f5b79be commit 61d1063

File tree

10 files changed

+274
-142
lines changed

10 files changed

+274
-142
lines changed

src/core/evaluator.js

Lines changed: 154 additions & 138 deletions
Large diffs are not rendered by default.

src/display/text_layer.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -188,7 +188,7 @@ function appendText(task, geom, styles, ctx) {
188188
(task._enhanceTextSelection && AllWhitespaceRegexp.test(geom.str))
189189
) {
190190
shouldScaleText = true;
191-
} else if (geom.transform[0] !== geom.transform[3]) {
191+
} else if (geom.str !== " " && geom.transform[0] !== geom.transform[3]) {
192192
const absScaleX = Math.abs(geom.transform[0]),
193193
absScaleY = Math.abs(geom.transform[3]);
194194
// When the horizontal/vertical scaling differs significantly, also scale

test/pdfs/.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
!issue1155r.pdf
1414
!issue2017r.pdf
1515
!bug1727053.pdf
16+
!issue11913.pdf
1617
!issue2391-1.pdf
1718
!issue2391-2.pdf
1819
!issue14046.pdf
@@ -182,6 +183,7 @@
182183
!issue11931.pdf
183184
!issue1655r.pdf
184185
!issue6541.pdf
186+
!issue10640.pdf
185187
!issue2948.pdf
186188
!issue6231_1.pdf
187189
!issue10402.pdf
@@ -285,6 +287,7 @@
285287
!issue2840.pdf
286288
!issue4061.pdf
287289
!issue4668.pdf
290+
!issue13226.pdf
288291
!PDFJS-7562-reduced.pdf
289292
!issue11768_reduced.pdf
290293
!issue5039.pdf
@@ -440,6 +443,7 @@
440443
!annotation-fileattachment.pdf
441444
!annotation-text-widget.pdf
442445
!annotation-choice-widget.pdf
446+
!issue10900.pdf
443447
!annotation-button-widget.pdf
444448
!annotation-polyline-polygon.pdf
445449
!annotation-polyline-polygon-without-appearance.pdf
@@ -462,6 +466,7 @@
462466
!issue9972-3.pdf
463467
!tiling-pattern-box.pdf
464468
!tiling-pattern-large-steps.pdf
469+
!issue13201.pdf
465470
!issue11555.pdf
466471
!issue12337.pdf
467472
!pr12564.pdf

test/pdfs/issue10640.pdf

49.1 KB
Binary file not shown.

test/pdfs/issue10900.pdf

33.3 KB
Binary file not shown.

test/pdfs/issue11913.pdf

32.8 KB
Binary file not shown.

test/pdfs/issue13201.pdf

146 KB
Binary file not shown.

test/pdfs/issue13226.pdf

972 Bytes
Binary file not shown.

test/unit/api_spec.js

Lines changed: 113 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,10 @@ describe("api", function () {
7373
}, WAIT_TIMEOUT);
7474
}
7575

76+
function mergeText(items) {
77+
return items.map(chunk => chunk.str + (chunk.hasEOL ? "\n" : "")).join("");
78+
}
79+
7680
describe("getDocument", function () {
7781
it("creates pdf doc from URL-string", async function () {
7882
const urlStr = TEST_PDFS_PATH + basicApiFileName;
@@ -1604,11 +1608,17 @@ describe("api", function () {
16041608
const data = await Promise.all([defaultPromise, parametersPromise]);
16051609

16061610
expect(!!data[0].items).toEqual(true);
1607-
expect(data[0].items.length).toEqual(12);
1611+
expect(data[0].items.length).toEqual(11);
16081612
expect(!!data[0].styles).toEqual(true);
16091613

1614+
const page1 = mergeText(data[0].items);
1615+
expect(page1).toEqual(`Table Of Content
1616+
Chapter 1 .......................................................... 2
1617+
Paragraph 1.1 ...................................................... 3
1618+
page 1 / 3`);
1619+
16101620
expect(!!data[1].items).toEqual(true);
1611-
expect(data[1].items.length).toEqual(7);
1621+
expect(data[1].items.length).toEqual(6);
16121622
expect(!!data[1].styles).toEqual(true);
16131623
});
16141624

@@ -1643,6 +1653,107 @@ describe("api", function () {
16431653
await loadingTask.destroy();
16441654
});
16451655

1656+
it("gets text content, with no extra spaces (issue 13226)", async function () {
1657+
const loadingTask = getDocument(buildGetDocumentParams("issue13226.pdf"));
1658+
const pdfDoc = await loadingTask.promise;
1659+
const pdfPage = await pdfDoc.getPage(1);
1660+
const { items } = await pdfPage.getTextContent();
1661+
const text = mergeText(items);
1662+
1663+
expect(text).toEqual(
1664+
"Mitarbeiterinnen und Mitarbeiter arbeiten in über 100 Ländern engagiert im Dienste"
1665+
);
1666+
1667+
await loadingTask.destroy();
1668+
});
1669+
1670+
it("gets text content, with merged spaces (issue 13201)", async function () {
1671+
const loadingTask = getDocument(buildGetDocumentParams("issue13201.pdf"));
1672+
const pdfDoc = await loadingTask.promise;
1673+
const pdfPage = await pdfDoc.getPage(1);
1674+
const { items } = await pdfPage.getTextContent();
1675+
const text = mergeText(items);
1676+
1677+
expect(
1678+
text.includes(
1679+
"Abstract. A purely peer-to-peer version of electronic cash would allow online"
1680+
)
1681+
).toEqual(true);
1682+
expect(
1683+
text.includes(
1684+
"avoid mediating disputes. The cost of mediation increases transaction costs, limiting the"
1685+
)
1686+
).toEqual(true);
1687+
expect(
1688+
text.includes(
1689+
"system is secure as long as honest nodes collectively control more CPU power than any"
1690+
)
1691+
).toEqual(true);
1692+
1693+
await loadingTask.destroy();
1694+
});
1695+
1696+
it("gets text content, with no spaces between letters of words (issue 11913)", async function () {
1697+
const loadingTask = getDocument(buildGetDocumentParams("issue11913.pdf"));
1698+
const pdfDoc = await loadingTask.promise;
1699+
const pdfPage = await pdfDoc.getPage(1);
1700+
const { items } = await pdfPage.getTextContent();
1701+
const text = mergeText(items);
1702+
1703+
expect(
1704+
text.includes(
1705+
"1. The first of these cases arises from the tragic handicap which has blighted the life of the Plaintiff, and from the response of the"
1706+
)
1707+
).toEqual(true);
1708+
expect(
1709+
text.includes(
1710+
"argued in this Court the appeal raises narrower, but important, issues which may be summarised as follows:-"
1711+
)
1712+
).toEqual(true);
1713+
await loadingTask.destroy();
1714+
});
1715+
1716+
it("gets text content, with merged spaces (issue 10900)", async function () {
1717+
const loadingTask = getDocument(buildGetDocumentParams("issue10900.pdf"));
1718+
const pdfDoc = await loadingTask.promise;
1719+
const pdfPage = await pdfDoc.getPage(1);
1720+
const { items } = await pdfPage.getTextContent();
1721+
const text = mergeText(items);
1722+
1723+
expect(
1724+
text.includes(`3 3 3 3
1725+
851.5 854.9 839.3 837.5
1726+
633.6 727.8 789.9 796.2
1727+
1,485.1 1,582.7 1,629.2 1,633.7
1728+
114.2 121.7 125.3 130.7
1729+
13.0x 13.0x 13.0x 12.5x`)
1730+
).toEqual(true);
1731+
1732+
await loadingTask.destroy();
1733+
});
1734+
1735+
it("gets text content, with spaces (issue 10640)", async function () {
1736+
const loadingTask = getDocument(buildGetDocumentParams("issue10640.pdf"));
1737+
const pdfDoc = await loadingTask.promise;
1738+
const pdfPage = await pdfDoc.getPage(1);
1739+
const { items } = await pdfPage.getTextContent();
1740+
const text = mergeText(items);
1741+
1742+
expect(
1743+
text.includes(`Open Sans is a humanist sans serif typeface designed by Steve Matteson.
1744+
Open Sans was designed with an upright stress, open forms and a neu-
1745+
tral, yet friendly appearance. It was optimized for print, web, and mobile
1746+
interfaces, and has excellent legibility characteristics in its letterforms (see
1747+
figure \x81 on the following page). This font is available from the Google Font
1748+
Directory [\x81] as TrueType files licensed under the Apache License version \x82.\x80.
1749+
This package provides support for this font in LATEX. It includes Type \x81
1750+
versions of the fonts, converted for this package using FontForge from its
1751+
sources, for full support with Dvips.`)
1752+
).toEqual(true);
1753+
1754+
await loadingTask.destroy();
1755+
});
1756+
16461757
it("gets empty structure tree", async function () {
16471758
const tree = await page.getStructTree();
16481759

test/unit/pdf_find_controller_spec.js

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -268,7 +268,7 @@ describe("pdf_find_controller", function () {
268268
pageIndex: 0,
269269
matchIndex: 0,
270270
},
271-
pageMatches: [[19, 48, 66]],
271+
pageMatches: [[19, 46, 62]],
272272
pageMatchesLength: [[8, 8, 8]],
273273
});
274274
});

0 commit comments

Comments
 (0)