feed: enable ignoring duplicate textnotes option
#54
reduce-duplicates
into master
@ -257,9 +257,36 @@ document.body.addEventListener('click', (e) => {
|
|||||||
|
|
||||||
const textNoteList = []; // could use indexDB
|
const textNoteList = []; // could use indexDB
|
||||||
const eventRelayMap = {}; // eventId: [relay1, relay2]
|
const eventRelayMap = {}; // eventId: [relay1, relay2]
|
||||||
|
const replyList = [];
|
||||||
|
const reactionMap = {};
|
||||||
const hasEventTag = tag => tag[0] === 'e';
|
const hasEventTag = tag => tag[0] === 'e';
|
||||||
|
|
||||||
|
const dropDuplicateToggle = document.querySelector('#duplicates');
|
||||||
|
let dropDuplicate = JSON.parse(localStorage.getItem('filter_duplicates')) ?? true;
|
||||||
|
dropDuplicateToggle.addEventListener('click', (e) => {
|
||||||
|
localStorage.setItem('filter_duplicates', e.target.checked);
|
||||||
|
dropDuplicate = e.target.checked;
|
||||||
|
});
|
||||||
|
dropDuplicateToggle.checked = dropDuplicate;
|
||||||
|
|
||||||
|
function isValidNote(evt) {
|
||||||
|
|||||||
|
if (dropDuplicate) {
|
||||||
|
const similarEvents = [
|
||||||
|
...textNoteList.filter(({content}) => content === evt.content),
|
||||||
x1ddos
commented 2 years ago
Review
maybe i'm missing something but looks like
loops over all text notes regardless of the timestamp, and only after is it filtered by or is it not how it works? maybe i'm missing something but looks like `textNoteList` contains all events received since a browser window is open, can easily be older than 1h because `textNoteList.push(evt)` happens in `handleTextNote` every time, and so this filter here:
> textNoteList.filter(({content}) => content === evt.content)
loops over all text notes regardless of the timestamp, and only after is it filtered by `({created_at}) => ((created_at + 3600) > evt.created_at)`, but this last filter is run after looping over all `textNoteList` items.
or is it not how it works?
offbyn
commented 2 years ago
Review
but now thinking indexDB, I guess it should just all be in 1 table in indexDB, wdyt? `textNoteList.push(evt)` is in a condition when the event
has some event tags it goes to `handleReply`, at the time I thought better have 2 different array, so they don't get too big and it is always clear if it is an reply or not.
https://git.qcode.ch/nostr/nostrweb/src/commit/89c54ac08fb67cb9f0040812fd63a3a1917e79dc/src/main.js#L267
but now thinking indexDB, I guess it should just all be in 1 table in indexDB, wdyt?
x1ddos
commented 2 years ago
Review
re: indexeddb. yeah, i guess 2 additional columns, some one other benefit of indexeddb imho is it works in web workers - related to push notifications in #25. of course, indexeddb doesn't seem as easy as localstorage, judging by an example in https://github.com/mdn/dom-examples/blob/main/to-do-notifications/scripts/todo.js (from https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API) re: indexeddb. yeah, i guess 2 additional columns, some `in_reply_to` and `root`? then sounds like it could work with a single table.
one other benefit of indexeddb imho is it works in web workers - related to push notifications in https://git.qcode.ch/nostr/nostrweb/issues/25.
of course, indexeddb doesn't seem as easy as localstorage, judging by an example in https://github.com/mdn/dom-examples/blob/main/to-do-notifications/scripts/todo.js (from https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API)
x1ddos
commented 2 years ago
Review
but even with indexeddb... suppose you store all events in an
not sure if it'll actually help because:
wouldn't it make more sense to compare some but even with indexeddb... suppose you store all events in an `events` table. then how would it work to count duplicate messages. i imagine something like this:
```sql
select count(*) from events where content = <some-content>
```
not sure if it'll actually help because:
- content can be quite large
- you wouldn't want to index on content column which means the sql query above will be slow, maybe even slower that looping over the `textNoteList` array
wouldn't it make more sense to compare some `hash(content)`?
offbyn
commented 2 years ago
Review
makes sense, since nostr-tools is already using noble hashes anyway is sha256 overkill?
https://www.npmjs.com/package/@noble/hashes I'll look also in cleaning up whitespaces before hashing. makes sense, since nostr-tools is already using noble hashes anyway
is sha256 overkill?
```js
import { sha256 } from '@noble/hashes/sha256';
console.log(sha256('abc')));
```
```
├─┬ nostr-tools@0.24.1
│ ├── @noble/hashes@0.5.
```
https://www.npmjs.com/package/@noble/hashes
I'll look also in cleaning up whitespaces before hashing.
x1ddos
commented 2 years ago
Review
here's an alternative. haven't measured but i think it would be the fastest:
however, even that doesn't work. i mean, it'll produce too many false negatives. imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should. i still think #51 is a better way. here's an alternative. haven't measured but i think it would be the fastest:
```js
const dups = {};
function countEventDup(content) {
const key = hash(content);
let elem = dups[key];
if (!elem) {
dups[key] = {n: 1, lastseen: Date.now()};
return 1;
}
elem.n += 1;
return elem.n;
}
function isValidNote(evt) {
return countEventDup(evt.content) < 5;
}
```
however, even that doesn't work. i mean, it'll produce too many false negatives. imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should. i still think https://git.qcode.ch/nostr/nostrweb/issues/51#issuecomment-519 is a better way.
x1ddos
commented 2 years ago
Review
actually, they don't have to be replies. just random people saying "hi" or "gm", on different relays. why should they suffer? > imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should
actually, they don't have to be replies. just random people saying "hi" or "gm", on different relays. why should they suffer?
offbyn
commented 2 years ago
Review
true then it will drop the new ones, 5 is arbitrary of course. also after reloading the newest 5 will render and not show the first 5.
they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself? > 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should
true then it will drop the new ones, 5 is arbitrary of course.
also after reloading the newest 5 will render and not show the first 5.
> i still think #51 is a better way.
they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself?
x1ddos
commented 2 years ago
Review
i guess i'm hoping one day i won't have to reload the page :)
but the same motivation, right? i'm just thinking. imagine i'm connected to a relay and follow alice. waiting for her to say "hi". but there are other 100 people saying "hi" all the time. so, i'll never see the "hi" from alice. with that scenario, i think there should be more in the algorithm. for example, count duplicates only from pubkeys not in the contacts list, so only the public feed. otherwise, the risk is high "important" messages are swallowed. either way, let's add this. even temporary, maybe it'll help a bit. > also after reloading the newest 5 will render and not show the first 5.
i guess i'm hoping one day i won't have to reload the page :)
> they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself?
but the same motivation, right?
i'm just thinking. imagine i'm connected to a relay and follow alice. waiting for her to say "hi". but there are other 100 people saying "hi" all the time. so, i'll never see the "hi" from alice.
with that scenario, i think there should be more in the algorithm. for example, count duplicates only from pubkeys not in the contacts list, so only the public feed. otherwise, the risk is high "important" messages are swallowed.
either way, let's add this. even temporary, maybe it'll help a bit.
offbyn
commented 2 years ago
Review
indeed I should keep that more in mind, it shouldn't just explode after some days, currently memory prob just grows forever. indexdb could drop old events maybe 🤔 , ( > i guess i'm hoping one day i won't have to reload the page :)
indeed I should keep that more in mind, it shouldn't just explode after some days, currently memory prob just grows forever.
indexdb could drop old events maybe 🤔 , (`array.unshift()`).
x1ddos
commented 2 years ago
Review
definitely. well, from the page memory. then i'd imagine in the future, i can scroll back and forth and the client requests older/newer events and re-render. > drop old events
definitely. well, from the page memory. then i'd imagine in the future, i can scroll back and forth and the client requests older/newer events and re-render.
offbyn
commented 2 years ago
Review
this is the way 👍 > in the future, i can scroll back and forth and the client requests older/newer events and re-render.
this is the way 👍
|
|||||||
|
...replyList.filter(({content}) => content === evt.content),
|
||||||
|
].filter(({created_at}) => ((created_at + 3600) > evt.created_at));
|
||||||
|
if (similarEvents?.length >= 5) {
|
||||||
|
console.info(`DROP event with content: "${evt.content.trim()}" already got ${similarEvents.length} similar events withing the last hour`, similarEvents, `\n dropped event id: ${evt.id}`);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
function handleTextNote(evt, relay) {
|
function handleTextNote(evt, relay) {
|
||||||
|
if (!isValidNote(evt)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (eventRelayMap[evt.id]) {
|
if (eventRelayMap[evt.id]) {
|
||||||
eventRelayMap[evt.id] = [relay, ...(eventRelayMap[evt.id])];
|
eventRelayMap[evt.id] = [relay, ...(eventRelayMap[evt.id])];
|
||||||
} else {
|
} else {
|
||||||
@ -273,9 +300,6 @@ function handleTextNote(evt, relay) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const replyList = [];
|
|
||||||
const reactionMap = {};
|
|
||||||
|
|
||||||
const getReactionList = (id) => {
|
const getReactionList = (id) => {
|
||||||
return reactionMap[id]?.map(({content}) => content) || [];
|
return reactionMap[id]?.map(({content}) => content) || [];
|
||||||
};
|
};
|
||||||
@ -357,6 +381,7 @@ setInterval(() => {
|
|||||||
}, 10000);
|
}, 10000);
|
||||||
|
|
||||||
const getNoxyUrl = (type, url, id, relay) => {
|
const getNoxyUrl = (type, url, id, relay) => {
|
||||||
|
return false;
|
||||||
if (!isHttpUrl(url)) {
|
if (!isHttpUrl(url)) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
@ -371,6 +396,9 @@ const fetchQue = [];
|
|||||||
let fetchPending;
|
let fetchPending;
|
||||||
const fetchNext = (href, id, relay) => {
|
const fetchNext = (href, id, relay) => {
|
||||||
const noxy = getNoxyUrl('meta', href, id, relay);
|
const noxy = getNoxyUrl('meta', href, id, relay);
|
||||||
|
if (!noxy) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
const previewId = noxy.searchParams.toString();
|
const previewId = noxy.searchParams.toString();
|
||||||
if (fetchPending) {
|
if (fetchPending) {
|
||||||
fetchQue.push({href, id, relay});
|
fetchQue.push({href, id, relay});
|
||||||
|
all looks good except this. i see a couple potential issues:
the complexity is O(N). as the number of notes increases, so is the average run time of the loop and it's run for each text note...; how about using an object or something in this pseudo-code:
localStorage.setItem(content_hash, content_hash_counter+1)
. the "// could use indexDB" comment on textNoteList = []
is a good hint :)
there's no eviction of "old" items. suppose one says "gm" once a day. after 5th day, their "gm" text note won't make it anymore, assuming the browser window stays open for 5 days. so, maybe have some kind of a queue where old items are expunged after some time?
i actually meant, where a counter of duplicate notes resets or decreases
I guess it's time to try indexDB, should have done earlier. I think it makes more sense to finally go indexDB instead of content hash workaround.
yep, already seen dropped events with the content "hi" loading only last 24h. the check could only take events into account that are younger than < 1h, this should work ok-ish for keeping the client open for days, I guess + with this change it wouldn't need to check all events but just the newest.