From 35eca2c5d2069e7b5fc8ab4734f1cd3fa6096cc9 Mon Sep 17 00:00:00 2001 From: ansuz Date: Fri, 21 Feb 2020 08:36:05 -0500 Subject: [PATCH] pull file streaming out into its own file, leave a few notes --- lib/storage/file.js | 122 +++++++++++++------------------------------- lib/stream-file.js | 76 +++++++++++++++++++++++++++ 2 files changed, 112 insertions(+), 86 deletions(-) create mode 100644 lib/stream-file.js diff --git a/lib/storage/file.js b/lib/storage/file.js index b1ac4de3d..e98b17f94 100644 --- a/lib/storage/file.js +++ b/lib/storage/file.js @@ -10,11 +10,9 @@ var Util = require("../common-util"); var Meta = require("../metadata"); var Extras = require("../hk-util"); -const Schedule = require("../schedule"); -const Readline = require("readline"); -const ToPull = require('stream-to-pull-stream'); -const Pull = require('pull-stream'); +const readFileBin = require("../stream-file").readFileBin; +const Schedule = require("../schedule"); const isValidChannelId = function (id) { return typeof(id) === 'string' && id.length >= 32 && id.length < 50 && @@ -60,13 +58,24 @@ var channelExists = function (filepath, cb) { }); }; +// readMessagesBin asynchronously iterates over the messages in a channel log +// the handler for each message must call back to read more, which should mean +// that this function has a lower memory profile than our classic method +// of reading logs line by line. +// it also allows the handler to abort reading at any time +const readMessagesBin = (env, id, start, msgHandler, cb) => { + const stream = Fs.createReadStream(mkPath(env, id), { start: start }); + return void readFileBin(env, stream, msgHandler, cb); +}; + // reads classic metadata from a channel log and aborts // returns undefined if the first message was not an object (not an array) var getMetadataAtPath = function (Env, path, _cb) { - var stream; + const stream = Fs.createReadStream(path, { start: 0 }); // cb implicitly destroys the stream, if it exists // and calls back asynchronously no more than once + /* var cb = Util.once(Util.both(function () { try { stream.destroy(); @@ -74,20 +83,26 @@ var getMetadataAtPath = function (Env, path, _cb) { return err; } }, Util.mkAsync(_cb))); + */ - // stream creation emit errors... 
probably ENOENT - stream = Fs.createReadStream(path, { encoding: 'utf8' }).on('error', cb); - - // stream lines - const rl = Readline.createInterface({ - input: stream, + var cb = Util.once(Util.mkAsync(_cb), function () { + throw new Error("Multiple Callbacks"); }); var i = 0; - rl - .on('line', function (line) { + return readFileBin(Env, stream, function (msgObj, readMore, abort) { + const line = msgObj.buff.toString('utf8'); + + if (!line) { + return readMore(); + } + // metadata should always be on the first line or not exist in the channel at all - if (i++ > 0) { return void cb(); } + if (i++ > 0) { + console.log("aborting"); + abort(); + return void cb(); + } var metadata; try { metadata = JSON.parse(line); @@ -102,9 +117,10 @@ var getMetadataAtPath = function (Env, path, _cb) { // if you can't parse, that's bad return void cb("INVALID_METADATA"); } - }) - .on('close', cb) - .on('error', cb); + readMore(); + }, function (err) { + cb(err); + }); }; var closeChannel = function (env, channelName, cb) { @@ -150,6 +166,7 @@ var clearChannel = function (env, channelId, _cb) { /* readMessages is our classic method of reading messages from the disk notably doesn't provide a means of aborting if you finish early */ +// XXX replicate current API on top of readMessagesBin var readMessages = function (path, msgHandler, cb) { var remainder = ''; var stream = Fs.createReadStream(path, { encoding: 'utf8' }); @@ -186,6 +203,7 @@ var getChannelMetadata = function (Env, channelId, cb) { // low level method for getting just the dedicated metadata channel var getDedicatedMetadata = function (env, channelId, handler, cb) { var metadataPath = mkMetadataPath(env, channelId); + // XXX use readFileBin readMessages(metadataPath, function (line) { if (!line) { return; } try { @@ -266,75 +284,6 @@ var writeMetadata = function (env, channelId, data, cb) { }; -// transform a stream of arbitrarily divided data -// into a stream of buffers divided by newlines in the source stream -// TODO see if we could improve performance by using libnewline -const NEWLINE_CHR = ('\n').charCodeAt(0); -const mkBufferSplit = () => { - let remainder = null; - return Pull((read) => { - return (abort, cb) => { - read(abort, function (end, data) { - if (end) { - if (data) { console.log("mkBufferSplit() Data at the end"); } - cb(end, remainder ? [remainder, data] : [data]); - remainder = null; - return; - } - const queue = []; - for (;;) { - const offset = data.indexOf(NEWLINE_CHR); - if (offset < 0) { - remainder = remainder ? Buffer.concat([remainder, data]) : data; - break; - } - let subArray = data.slice(0, offset); - if (remainder) { - subArray = Buffer.concat([remainder, subArray]); - remainder = null; - } - queue.push(subArray); - data = data.slice(offset + 1); - } - cb(end, queue); - }); - }; - }, Pull.flatten()); -}; - -// return a streaming function which transforms buffers into objects -// containing the buffer and the offset from the start of the stream -const mkOffsetCounter = () => { - let offset = 0; - return Pull.map((buff) => { - const out = { offset: offset, buff: buff }; - // +1 for the eaten newline - offset += buff.length + 1; - return out; - }); -}; - -// readMessagesBin asynchronously iterates over the messages in a channel log -// the handler for each message must call back to read more, which should mean -// that this function has a lower memory profile than our classic method -// of reading logs line by line. 
-// it also allows the handler to abort reading at any time
-const readMessagesBin = (env, id, start, msgHandler, cb) => {
-    const stream = Fs.createReadStream(mkPath(env, id), { start: start });
-    let keepReading = true;
-    Pull(
-        ToPull.read(stream),
-        mkBufferSplit(),
-        mkOffsetCounter(),
-        Pull.asyncMap((data, moreCb) => {
-            msgHandler(data, moreCb, () => { keepReading = false; moreCb(); });
-        }),
-        Pull.drain(() => (keepReading), (err) => {
-            cb((keepReading) ? err : undefined);
-        })
-    );
-};
-
 // check if a file exists at $path
 var checkPath = function (path, callback) {
     Fs.stat(path, function (err) {
@@ -428,6 +377,7 @@ var removeArchivedChannel = function (env, channelName, cb) {
     });
 };
 
+// XXX use ../plan.js
 var listChannels = function (root, handler, cb) {
     // do twenty things at a time
     var sema = Semaphore.create(20);
diff --git a/lib/stream-file.js b/lib/stream-file.js
new file mode 100644
index 000000000..12322d868
--- /dev/null
+++ b/lib/stream-file.js
@@ -0,0 +1,76 @@
+/* jshint esversion: 6 */
+/* global Buffer */
+
+const ToPull = require('stream-to-pull-stream');
+const Pull = require('pull-stream');
+
+const Stream = module.exports;
+
+// transform a stream of arbitrarily divided data
+// into a stream of buffers divided by newlines in the source stream
+// TODO see if we could improve performance by using libnewline
+const NEWLINE_CHR = ('\n').charCodeAt(0);
+const mkBufferSplit = () => {
+    let remainder = null;
+    return Pull((read) => {
+        return (abort, cb) => {
+            read(abort, function (end, data) {
+                if (end) {
+                    if (data) { console.log("mkBufferSplit() Data at the end"); }
+                    cb(end, remainder ? [remainder, data] : [data]);
+                    remainder = null;
+                    return;
+                }
+                const queue = [];
+                for (;;) {
+                    const offset = data.indexOf(NEWLINE_CHR);
+                    if (offset < 0) {
+                        remainder = remainder ? Buffer.concat([remainder, data]) : data;
+                        break;
+                    }
+                    let subArray = data.slice(0, offset);
+                    if (remainder) {
+                        subArray = Buffer.concat([remainder, subArray]);
+                        remainder = null;
+                    }
+                    queue.push(subArray);
+                    data = data.slice(offset + 1);
+                }
+                cb(end, queue);
+            });
+        };
+    }, Pull.flatten());
+};
+
+// return a streaming function which transforms buffers into objects
+// containing the buffer and the offset from the start of the stream
+const mkOffsetCounter = () => {
+    let offset = 0;
+    return Pull.map((buff) => {
+        const out = { offset: offset, buff: buff };
+        // +1 for the eaten newline
+        offset += buff.length + 1;
+        return out;
+    });
+};
+
+// readFileBin asynchronously iterates over the messages in a stream
+// the handler for each message must call back to read more, which should mean
+// that this function has a lower memory profile than our classic method
+// of reading logs line by line.
+// it also allows the handler to abort reading at any time
+Stream.readFileBin = (env, stream, msgHandler, cb) => {
+    //const stream = Fs.createReadStream(path, { start: start });
+    let keepReading = true;
+    Pull(
+        ToPull.read(stream),
+        mkBufferSplit(),
+        mkOffsetCounter(),
+        Pull.asyncMap((data, moreCb) => {
+            msgHandler(data, moreCb, () => { keepReading = false; moreCb(); });
+        }),
+        Pull.drain(() => (keepReading), (err) => {
+            cb((keepReading) ? err : undefined);
+        })
+    );
+};
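-- 
Usage note, not part of the patch: the XXX comment in lib/storage/file.js
suggests replicating the classic readMessages API on top of readMessagesBin.
A minimal sketch of that wrapper, assuming this repository layout (the
require path below is illustrative):

    const Fs = require('fs');
    const readFileBin = require('./lib/stream-file').readFileBin;

    // classic signature: msgHandler receives one utf8 line at a time and
    // cb fires once when the stream is exhausted or fails
    var readMessages = function (path, msgHandler, cb) {
        const stream = Fs.createReadStream(path);
        // readFileBin's first (env) argument is unused by its implementation
        readFileBin(null, stream, function (msgObj, readMore /*, abort */) {
            // msgObj.buff holds one newline-delimited record as a Buffer;
            // msgObj.offset is its byte offset from the start of the stream
            msgHandler(msgObj.buff.toString('utf8'));
            readMore();
        }, function (err) {
            cb(err);
        });
    };

Unlike the classic implementation, a handler that finishes early can call its
third argument (abort), which stops the pull pipeline; readFileBin then calls
back with undefined rather than an error.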