From 9e1d82f373330f7bf67c59667c2c7d2f1509ec28 Mon Sep 17 00:00:00 2001 From: ansuz Date: Wed, 17 Feb 2021 14:17:41 +0530 Subject: [PATCH] move archive deletion out of the main eviction script --- lib/eviction.js | 176 ++++++++++++++++++++++++-------------- scripts/evict-archived.js | 102 ++++++++++++++++++++++ 2 files changed, 213 insertions(+), 65 deletions(-) create mode 100644 scripts/evict-archived.js diff --git a/lib/eviction.js b/lib/eviction.js index 84db87d13..673c7313f 100644 --- a/lib/eviction.js +++ b/lib/eviction.js @@ -30,8 +30,16 @@ Env = { */ -module.exports = function (Env, cb) { - var complete = Util.once(Util.mkAsync(cb)); +// the number of ms artificially introduced between CPU-intensive operations +var THROTTLE_FACTOR = 10; + +var evictArchived = function (Env, cb) { + var Log; + var store; + var pinStore; + var blobs; + var retentionTime = +new Date() - (Env.archiveRetentionTime * 24 * 3600 * 1000); + var report = { // archivedChannelsRemoved, // archivedAccountsRemoved, @@ -53,67 +61,7 @@ module.exports = function (Env, cb) { // runningTime, }; - // the administrator should have set an 'inactiveTime' in their config - // if they didn't, just exit. - if (!Env.inactiveTime || typeof(Env.inactiveTime) !== "number") { - return void complete("NO_INACTIVE_TIME"); - } - // get a list of premium accounts on this instance - // pre-converted to the 'safeKey' format so we can easily compare - // them against ids we see on the filesystem - var premiumSafeKeys = Object.keys(Env.limits || {}) - .map(function (id) { - return Keys.canonicalize(id); - }) - .filter(Boolean) - .map(Util.escapeKeyCharacters); - - // files which have not been changed since before this date can be considered inactive - var inactiveTime = +new Date() - (Env.inactiveTime * 24 * 3600 * 1000); - - // files which were archived before this date can be considered safe to remove - var retentionTime = +new Date() - (Env.archiveRetentionTime * 24 * 3600 * 1000); - - var store; - var pinStore; - var Log; - var blobs; - - /* It's fairly easy to know if a channel or blob is active - but knowing whether it is pinned requires that we - keep the set of pinned documents in memory. - - Some users will share the same set of documents in their pin lists, - so the representation of pinned documents should scale sub-linearly - with the number of users and pinned documents. - - That said, sub-linear isn't great... - A Bloom filter is "a space-efficient probabilistic data structure" - which lets us check whether an item is _probably_ or _definitely not_ - in a set. This is good enough for our purposes since we just want to - know whether something can safely be removed and false negatives - (not safe to remove when it actually is) are acceptable. - - We set our capacity to some large number, and the error rate to whatever - we think is acceptable. - - TODO make this configurable ? - */ - var BLOOM_CAPACITY = (1 << 20) - 1; // over a million items - var BLOOM_ERROR = 1 / 10000; // an error rate of one in a thousand - // the number of ms artificially introduced between CPU-intensive operations - var THROTTLE_FACTOR = 10; - - // we'll use one filter for the set of active documents - var activeDocs = Bloom.optimalFilter(BLOOM_CAPACITY, BLOOM_ERROR); - // and another one for the set of pinned documents - var pinnedDocs = Bloom. optimalFilter(BLOOM_CAPACITY, BLOOM_ERROR); - - var startTime = +new Date(); - var msSinceStart = function () { - return (+new Date()) - startTime; - }; var loadStorage = function () { store = Env.store; @@ -237,6 +185,105 @@ module.exports = function (Env, cb) { })); }; + nThen(loadStorage) + .nThen(removeArchivedChannels) + .nThen(removeArchivedBlobProofs) + .nThen(removeArchivedBlobs) + .nThen(function () { + cb(); + }); +}; + +module.exports = function (Env, cb) { + var complete = Util.once(Util.mkAsync(cb)); + var report = { + // archivedChannelsRemoved, + // archivedAccountsRemoved, + // archivedBlobProofsRemoved, + // archivedBlobsRemoved, + + // totalChannels, + // activeChannels, + + // totalBlobs, + // activeBlobs, + + // totalAccounts, + // activeAccounts, + + // channelsArchived, + + launchTime: +new Date(), + // runningTime, + }; + + // the administrator should have set an 'inactiveTime' in their config + // if they didn't, just exit. + if (!Env.inactiveTime || typeof(Env.inactiveTime) !== "number") { + return void complete("NO_INACTIVE_TIME"); + } + + // get a list of premium accounts on this instance + // pre-converted to the 'safeKey' format so we can easily compare + // them against ids we see on the filesystem + var premiumSafeKeys = Object.keys(Env.limits || {}) + .map(function (id) { + return Keys.canonicalize(id); + }) + .filter(Boolean) + .map(Util.escapeKeyCharacters); + + // files which have not been changed since before this date can be considered inactive + var inactiveTime = +new Date() - (Env.inactiveTime * 24 * 3600 * 1000); + + // files which were archived before this date can be considered safe to remove + var retentionTime = +new Date() - (Env.archiveRetentionTime * 24 * 3600 * 1000); + + var store; + var pinStore; + var Log; + var blobs; + + /* It's fairly easy to know if a channel or blob is active + but knowing whether it is pinned requires that we + keep the set of pinned documents in memory. + + Some users will share the same set of documents in their pin lists, + so the representation of pinned documents should scale sub-linearly + with the number of users and pinned documents. + + That said, sub-linear isn't great... + A Bloom filter is "a space-efficient probabilistic data structure" + which lets us check whether an item is _probably_ or _definitely not_ + in a set. This is good enough for our purposes since we just want to + know whether something can safely be removed and false negatives + (not safe to remove when it actually is) are acceptable. + + We set our capacity to some large number, and the error rate to whatever + we think is acceptable. + + TODO make this configurable ? + */ + var BLOOM_CAPACITY = (1 << 20) - 1; // over a million items + var BLOOM_ERROR = 1 / 10000; // an error rate of one in a thousand + + // we'll use one filter for the set of active documents + var activeDocs = Bloom.optimalFilter(BLOOM_CAPACITY, BLOOM_ERROR); + // and another one for the set of pinned documents + var pinnedDocs = Bloom. optimalFilter(BLOOM_CAPACITY, BLOOM_ERROR); + + var startTime = +new Date(); + var msSinceStart = function () { + return (+new Date()) - startTime; + }; + + var loadStorage = function () { + store = Env.store; + pinStore = Env.pinStore; + Log = Env.Log; + blobs = Env.blobStore; + }; + var categorizeChannelsByActivity = function (w) { var channels = 0; var active = 0; @@ -566,9 +613,6 @@ module.exports = function (Env, cb) { }; nThen(loadStorage) - .nThen(removeArchivedChannels) - .nThen(removeArchivedBlobProofs) - .nThen(removeArchivedBlobs) // iterate over all documents and add them to a bloom filter if they have been active .nThen(categorizeChannelsByActivity) @@ -590,3 +634,5 @@ module.exports = function (Env, cb) { complete(void 0, report); }); }; + +module.exports.archived = evictArchived; diff --git a/scripts/evict-archived.js b/scripts/evict-archived.js new file mode 100644 index 000000000..7f90f9ff5 --- /dev/null +++ b/scripts/evict-archived.js @@ -0,0 +1,102 @@ +var Eviction = require("../lib/eviction"); +var nThen = require("nthen"); +var Store = require("../lib/storage/file"); +var BlobStore = require("../lib/storage/blob"); + +var Quota = require("../lib/commands/quota"); +var Environment = require("../lib/env"); +var Decrees = require("../lib/decrees"); + +var config = require("../lib/load-config"); + +var Env = Environment.create(config); + +var loadPremiumAccounts = function (Env, cb) { + nThen(function (w) { + // load premium accounts + Quota.updateCachedLimits(Env, w(function (err) { + if (err) { + Env.Log.error('EVICT_LOAD_PREMIUM_ACCOUNTS', { + error: err, + }); + } + })); + }).nThen(function (w) { + // load and apply decrees + Decrees.load(Env, w(function (err) { + if (err) { + Env.Log.error('EVICT_LOAD_DECREES', { + error: err.code || err, + message: err.message, + }); + } + })); + }).nThen(function () { + //console.log(Env.limits); + cb(); + }); +}; + +var prepareEnv = function (Env, cb) { + //Quota.applyCustomLimits(Env); + + nThen(function (w) { + /* Database adaptors + */ + + // load the store which will be used for iterating over channels + // and performing operations like archival and deletion + Store.create(config, w(function (err, _) { + if (err) { + w.abort(); + throw err; + } + Env.store = _; + })); + + Store.create({ + filePath: config.pinPath, + }, w(function (err, _) { + if (err) { + w.abort(); + throw err; + } + Env.pinStore = _; + })); + + // load the logging module so that you have a record of which + // files were archived or deleted at what time + var Logger = require("../lib/log"); + Logger.create(config, w(function (_) { + Env.Log = _; + })); + + config.getSession = function () {}; + BlobStore.create(config, w(function (err, _) { + if (err) { + w.abort(); + return console.error(err); + } + Env.blobStore = _; + })); + }).nThen(function (w) { + loadPremiumAccounts(Env, w(function (/* err */) { + //if (err) { } + })); + }).nThen(function () { + cb(); + }); +}; + +//console.log("starting"); +nThen(function (w) { + // load database adaptors and configuration values into the environment + prepareEnv(Env, w(function () { + //console.log("env prepared"); + + })); +}).nThen(function (w) { + Eviction.archived(Env, w(function () { + + })); +});