Skip to content

Commit 4d0c72c

Browse files
fix(algolia): chunk and validate size of data (#220)
* fix(algolia): chunk and validate size of data * refactor(algolia): truncate post content instead of skipping it * refactor(algolia): small tweaks to indexing script * refactor(algolia): give all objects uuid and date when indexed so we can update them and remove old pages easily
1 parent ba97423 commit 4d0c72c

File tree

5 files changed

+102
-4
lines changed

5 files changed

+102
-4
lines changed

algolia.js

Lines changed: 62 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,40 @@
1515
*/
1616
require('dotenv').config();
1717
const algoliasearch = require('algoliasearch');
18+
1819
const fs = require('fs');
20+
const {sizeof} = require('sizeof');
21+
22+
const maxChunkSizeInBytes = 10000000; // 10,000,000
23+
24+
/**
 * Splits an array of AlgoliaCollectionItem into consecutive chunks whose
 * accumulated measured size stays below a byte limit, preserving item order.
 *
 * An item that is by itself at or above the limit is placed alone in its own
 * chunk (it is never dropped). Empty chunks are never emitted, so an empty
 * input yields an empty array.
 *
 * @param {AlgoliaCollectionItem[]} arr Items to split.
 * @param {number} [maxSizeInBytes] Exclusive upper bound for a chunk's total
 *   size. Defaults to the module-level `maxChunkSizeInBytes`.
 * @param {function(AlgoliaCollectionItem): number} [measure] Returns the size
 *   of a single item in bytes. Defaults to `sizeof`.
 * @return {AlgoliaCollectionItem[][]}
 */
const chunkAlgolia = (
  arr,
  maxSizeInBytes = maxChunkSizeInBytes,
  measure = sizeof
) => {
  const chunked = [];
  let temp = [];
  let tempSizeInBytes = 0;

  for (const current of arr) {
    const currentSizeInBytes = measure(current);
    const wouldOverflow =
      tempSizeInBytes + currentSizeInBytes >= maxSizeInBytes;

    // Only flush a chunk that actually holds something; otherwise an oversized
    // item arriving on an empty buffer would emit an empty chunk.
    if (wouldOverflow && temp.length > 0) {
      chunked.push(temp);
      temp = [];
      tempSizeInBytes = 0;
    }

    temp.push(current);
    tempSizeInBytes += currentSizeInBytes;
  }

  // Flush the trailing chunk, skipping it when the input was empty.
  if (temp.length > 0) {
    chunked.push(temp);
  }
  return chunked;
};
1948

2049
async function index() {
50+
const indexedOn = new Date();
51+
2152
if (!process.env.ALGOLIA_APP_ID || !process.env.ALGOLIA_API_KEY) {
2253
console.warn('Missing Algolia environment variables, skipping indexing.');
2354
return;
@@ -26,17 +57,45 @@ async function index() {
2657
const raw = fs.readFileSync('dist/algolia.json', 'utf-8');
2758
const algoliaData = JSON.parse(raw);
2859

60+
// Set date of when object is being added to algolia
61+
algoliaData.map(e => {
62+
e.indexedOn = indexedOn.getTime();
63+
return e;
64+
});
65+
66+
const chunkedAlgoliaData = chunkAlgolia(algoliaData);
67+
const postsCount = algoliaData.length;
68+
2969
const client = algoliasearch(
3070
process.env.ALGOLIA_APP_ID,
3171
process.env.ALGOLIA_API_KEY
3272
);
3373
const index = client.initIndex('prod_developer_chrome');
3474

35-
console.log(`Indexing ${algoliaData.length} articles`);
75+
console.log(
76+
`Indexing ${postsCount} articles amongst ${chunkedAlgoliaData.length} chunk(s).`
77+
);
78+
79+
// When indexing data we mark these two fields as fields that can be filtered by.
80+
await index.setSettings({
81+
attributesForFaceting: ['locale', 'tags'],
82+
});
83+
84+
// Update algolia index with new data
85+
for (let i = 0; i < chunkedAlgoliaData.length; i++) {
86+
await index.saveObjects(chunkedAlgoliaData[i], {
87+
autoGenerateObjectIDIfNotExist: true,
88+
});
89+
}
90+
91+
console.log('Updated algolia data.');
3692

37-
await index.replaceAllObjects(algoliaData, {
38-
autoGenerateObjectIDIfNotExist: true,
93+
console.log('Deleting old data no longer in algolia.json.');
94+
await index.deleteBy({
95+
filters: `indexedOn < ${indexedOn.getTime()}`,
3996
});
97+
console.log('Deleted old data.');
98+
4099
console.log('Done!');
41100
}
42101

package-lock.json

Lines changed: 12 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

package.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@
7272
"ava": "^3.12.1",
7373
"chalk": "^4.1.0",
7474
"cheerio": "^1.0.0-rc.3",
75+
"crypto": "^1.0.1",
7576
"csso": "^4.0.3",
7677
"dotenv": "^8.2.0",
7778
"eslint-plugin-ava": "^11.0.0",
@@ -103,6 +104,7 @@
103104
"rimraf": "^3.0.2",
104105
"rollup-plugin-copy": "^3.3.0",
105106
"rollup-plugin-terser": "^7.0.2",
107+
"sizeof": "^1.0.0",
106108
"stylelint": "^13.7.0",
107109
"stylelint-config-sass-guidelines": "^7.1.0",
108110
"typescript": "^3.8.3",

site/_collections/algolia.js

Lines changed: 25 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,31 @@
1515
*/
1616

1717
const removeMarkdown = require('remove-markdown');
18+
const {createHash} = require('crypto');
19+
1820
const {generateSrc} = require('../_shortcodes/img');
1921

22+
/**
 * Shrink the given fulltext so that it is at most `limit` UTF-16 code units
 * long, cutting at the last newline found at or before the limit when there
 * is one.
 *
 * If no usable newline exists (none found, or the only one is the very first
 * character, which would leave nothing), the text is cut hard at `limit`,
 * backing off by one code unit if the cut would split a surrogate pair.
 *
 * @param {string} content
 * @param {number} [limit]
 * @return {string}
 */
function limitText(content, limit = 7500) {
  if (content.length <= limit) {
    return content;
  }

  // Prefer cutting at the nearest newline at or before the limit. Index 0 is
  // treated as "not found" because slicing there would return an empty string.
  const newlineIndex = content.lastIndexOf('\n', limit);
  if (newlineIndex > 0) {
    return content.slice(0, newlineIndex);
  }

  // Hard cut: avoid leaving a lone high surrogate at the end of the result.
  let cut = limit;
  const last = content.charCodeAt(cut - 1);
  const next = content.charCodeAt(cut);
  const isHighSurrogate = last >= 0xd800 && last <= 0xdbff;
  const isLowSurrogate = next >= 0xdc00 && next <= 0xdfff;
  if (isHighSurrogate && isLowSurrogate) {
    cut -= 1;
  }
  return content.slice(0, cut);
}
42+
2043
/**
2144
* @param {EleventyCollectionObject} collections
2245
* @returns {AlgoliaCollectionItem[]}
@@ -38,11 +61,12 @@ module.exports = collections => {
3861
algoliaCollectionItems.push({
3962
title: item.data.title,
4063
description: item.data.description,
41-
content: removeMarkdown(item.template.frontMatter.content),
64+
content: limitText(removeMarkdown(item.template.frontMatter.content)),
4265
url: item.url,
4366
tags: item.data.tags || [],
4467
locale: item.data.locale,
4568
photo: item.data.hero && generateSrc(item.data.hero),
69+
objectID: createHash('md5').update(item.url).digest('hex'),
4670
});
4771
}
4872
return algoliaCollectionItems;

types/site/_collections/algolia.d.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ declare global {
2323
tags: string[];
2424
locale: string;
2525
photo?: string;
26+
objectID: string;
2627
}
2728
}
2829

0 commit comments

Comments
 (0)