feed: enable ignoring duplicate textnotes option #54

Open
offbyn wants to merge 2 commits from reduce-duplicates into master

@ -10,7 +10,7 @@
}
@media (orientation: portrait) {
.mbox {
padding: 0 calc(.5 * var(--gap));
padding: 0 var(--gap-half);
}
}
.mbox:last-child {

@ -187,6 +187,16 @@ button:disabled {
flex-grow: 0;
}
input[type="checkbox"] {
margin: 0;
}
label.checkbox {
align-items: baseline;
display: flex;
gap: var(--gap-half);
}
button#publish {
align-self: end;
order: 2;

@ -83,6 +83,15 @@
<button type="submit" name="publish" tabindex="0" disabled>publish</button>
</div>
</form>
<form action="#" name="options">
<label class="checkbox" for="duplicates">
<input type="checkbox" name="dropDuplicate" id="duplicates">
<span>
drop duplicate events<br>
<small>ignore events that have already been rendered 5&nbsp;times within the last hour</small>
</span>
</label>
</form>
<form action="#" name="settings" autocomplete="new-password">
<label for="pubkey">public-key</label>
<input type="text" id="pubkey" autocomplete="off">

@ -14,6 +14,7 @@
--focus-outline: var(--focus-outline-width) var(--focus-outline-style) var(--focus-outline-color);
--font-small: 1.2rem;
--gap: 2.4rem;
--gap-half: 1.2rem;
}
::selection {

@ -257,9 +257,36 @@ document.body.addEventListener('click', (e) => {
const textNoteList = []; // could use indexDB
const eventRelayMap = {}; // eventId: [relay1, relay2]
const replyList = [];
const reactionMap = {};
const hasEventTag = tag => tag[0] === 'e';
const dropDuplicateToggle = document.querySelector('#duplicates');
let dropDuplicate = JSON.parse(localStorage.getItem('filter_duplicates')) ?? true;
dropDuplicateToggle.addEventListener('click', (e) => {
localStorage.setItem('filter_duplicates', e.target.checked);
dropDuplicate = e.target.checked;
});
dropDuplicateToggle.checked = dropDuplicate;
function isValidNote(evt) {
x1ddos commented 2 years ago
Review

all looks good except this. i see a couple potential issues:

  1. the complexity is O(N). as the number of notes increases, so is the average run time of the loop and it's run for each text note...; how about using an object or something in this pseudo-code: localStorage.setItem(content_hash, content_hash_counter+1). the "// could use indexDB" comment on textNoteList = [] is a good hint :)

  2. there's no eviction of "old" items. suppose one says "gm" once a day. after 5th day, their "gm" text note won't make it anymore, assuming the browser window stays open for 5 days. so, maybe have some kind of a queue where old items are expunged after some time?

all looks good except this. i see a couple potential issues: 1. the complexity is O(N). as the number of notes increases, so is the average run time of the loop and it's run for each text note...; how about using an object or something in this pseudo-code: `localStorage.setItem(content_hash, content_hash_counter+1)`. the "// could use indexDB" comment on `textNoteList = []` is a good hint :) 2. there's no eviction of "old" items. suppose one says "gm" once a day. after 5th day, their "gm" text note won't make it anymore, assuming the browser window stays open for 5 days. so, maybe have some kind of a queue where old items are expunged after some time?
x1ddos commented 2 years ago
Review

where old items are expunged after some time?

i actually meant, where a counter of duplicate notes resets or decreases

> where old items are expunged after some time? i actually meant, where a counter of duplicate notes resets or decreases
offbyn commented 2 years ago
Review
  1. I guess it's time to try indexDB, should have done earlier. I think it makes more sense to finally go indexDB instead of content hash workaround.

  2. yep, already seen dropped events with the content "hi" loading only last 24h. the check could only take events into account that are younger than < 1h, this should work ok-ish for keeping the client open for days, I guess + with this change it wouldn't need to check all events but just the newest.

1. I guess it's time to try indexDB, should have done earlier. I think it makes more sense to finally go indexDB instead of content hash workaround. 2. yep, already seen dropped events with the content "hi" loading only last 24h. the check could only take events into account that are younger than < 1h, this should work ok-ish for keeping the client open for days, I guess + with this change it wouldn't need to check all events but just the newest.
if (dropDuplicate) {
const similarEvents = [
...textNoteList.filter(({content}) => content === evt.content),
x1ddos commented 2 years ago
Review

maybe i'm missing something but looks like textNoteList contains all events received since a browser window is open, can easily be older than 1h because textNoteList.push(evt) happens in handleTextNote every time, and so this filter here:

textNoteList.filter(({content}) => content === evt.content)

loops over all text notes regardless of the timestamp, and only after is it filtered by ({created_at}) => ((created_at + 3600) > evt.created_at), but this last filter is run after looping over all textNoteList items.

or is it not how it works?

maybe i'm missing something but looks like `textNoteList` contains all events received since a browser window is open, can easily be older than 1h because `textNoteList.push(evt)` happens in `handleTextNote` every time, and so this filter here: > textNoteList.filter(({content}) => content === evt.content) loops over all text notes regardless of the timestamp, and only after is it filtered by `({created_at}) => ((created_at + 3600) > evt.created_at)`, but this last filter is run after looping over all `textNoteList` items. or is it not how it works?
offbyn commented 2 years ago
Review

textNoteList.push(evt) is in a condition when the event
has some event tags it goes to handleReply, at the time I thought better have 2 different array, so they don't get too big and it is always clear if it is an reply or not.

89c54ac08f/src/main.js (L267)

but now thinking indexDB, I guess it should just all be in 1 talbe in indexDB, wdyt?

`textNoteList.push(evt)` is in a condition when the event has some event tags it goes to `handleReply`, at the time I thought better have 2 different array, so they don't get too big and it is always clear if it is an reply or not. https://git.qcode.ch/nostr/nostrweb/src/commit/89c54ac08fb67cb9f0040812fd63a3a1917e79dc/src/main.js#L267 but now thinking indexDB, I guess it should just all be in 1 talbe in indexDB, wdyt?
x1ddos commented 2 years ago
Review

re: indexeddb. yeah, i guess 2 additional columns, some in_reply_to and root? then sounds like it could work with a single table.

one other benefit of indexeddb imho is it works in web workers - related to push notifications in #25.

of course, indexeddb doesn't seem as easy as localstorage, judging by an example in https://github.com/mdn/dom-examples/blob/main/to-do-notifications/scripts/todo.js (from https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API)

re: indexeddb. yeah, i guess 2 additional columns, some `in_reply_to` and `root`? then sounds like it could work with a single table. one other benefit of indexeddb imho is it works in web workers - related to push notifications in https://git.qcode.ch/nostr/nostrweb/issues/25. of course, indexeddb doesn't seem as easy as localstorage, judging by an example in https://github.com/mdn/dom-examples/blob/main/to-do-notifications/scripts/todo.js (from https://developer.mozilla.org/en-US/docs/Web/API/IndexedDB_API)
x1ddos commented 2 years ago
Review

but even with indexeddb... suppose you store all events in an events table. then how would it work to count duplicate messages. i imaging something like this:

select count(*) from events where content = <some-content>

not sure if it'll actually help because:

  • content can be quite large
  • you wouldn't want to index on content column which means the sql query above will be slow, maybe even slower that looping over the textNoteList array

wouldn't it make more sense to compare some hash(content)?

but even with indexeddb... suppose you store all events in an `events` table. then how would it work to count duplicate messages. i imaging something like this: ```sql select count(*) from events where content = <some-content> ``` not sure if it'll actually help because: - content can be quite large - you wouldn't want to index on content column which means the sql query above will be slow, maybe even slower that looping over the `textNoteList` array wouldn't it make more sense to compare some `hash(content)`?
offbyn commented 2 years ago
Review

makes sense, since nostr-tools is already using noble hases anyway

is sha256 overkill?

import { sha256 } from '@noble/hashes/sha256';
console.log(sha256('abc'))); 
├─┬ nostr-tools@0.24.1
│ ├── @noble/hashes@0.5.

https://www.npmjs.com/package/@noble/hashes

I'll look also in cleaning up whitespaces before hashing.

makes sense, since nostr-tools is already using noble hases anyway is sha256 overkill? ```js import { sha256 } from '@noble/hashes/sha256'; console.log(sha256('abc'))); ``` ``` ├─┬ nostr-tools@0.24.1 │ ├── @noble/hashes@0.5. ``` https://www.npmjs.com/package/@noble/hashes I'll look also in cleaning up whitespaces before hashing.
x1ddos commented 2 years ago
Review

here's an alternative. haven't measured but i think it would be the fastest:

const dups = {};

function countEventDup(content) {
  const key = hash(content);
  let elem = dups[key];
  if (!elem) {
    dups[key] = {n: 1, lastseen: Date.now()};
    return 1;
  }
  elem.n += 1;
  return elem.n;
}

function isValidNote(evt) {
  return countEventDup(evt.content) < 5;
}

however, even that doesn't work. i mean, it'll produce too many false negatives. imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should. i still think #51 is a better way.

here's an alternative. haven't measured but i think it would be the fastest: ```js const dups = {}; function countEventDup(content) { const key = hash(content); let elem = dups[key]; if (!elem) { dups[key] = {n: 1, lastseen: Date.now()}; return 1; } elem.n += 1; return elem.n; } function isValidNote(evt) { return countEventDup(evt.content) < 5; } ``` however, even that doesn't work. i mean, it'll produce too many false negatives. imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should. i still think https://git.qcode.ch/nostr/nostrweb/issues/51#issuecomment-519 is a better way.
x1ddos commented 2 years ago
Review

imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should

actually, they don't have to be replies. just random people saying "hi" or "gm", on different relays. why should they suffer?

> imagine 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should actually, they don't have to be replies. just random people saying "hi" or "gm", on different relays. why should they suffer?
offbyn commented 2 years ago
Review

10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should

true then it will drop the new ones, 5 is arbitrary ofcorse.

also after reloading the newest 5 will render and not show the first 5.

i still think #51 is a better way.

they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself?

> 10 people replying "hi" within a minute. first 5 pass, the rest won't be shown even though they should true then it will drop the new ones, 5 is arbitrary ofcorse. also after reloading the newest 5 will render and not show the first 5. > i still think #51 is a better way. they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself?
x1ddos commented 2 years ago
Review

also after reloading the newest 5 will render and not show the first 5.

i guess i'm hoping one day i won't have to reload the page :)

they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself?

but the same motivation, right?

i'm just thinking. imagine i'm connected to a relay and follow alice. waiting for her to say "hi". but there are other 100 people saying "hi" all the time. so, i'll never see the "hi" from alice.

with that scenario, i think there should be more in the algorithm. for example, count duplicates only from pubkeys not in the contacts list, so only the public feed. otherwise, the risk is high "important" messages are swallowed.

either way, let's add this. even temporary, maybe it'll help a bit.

> also after reloading the newest 5 will render and not show the first 5. i guess i'm hoping one day i won't have to reload the page :) > they are very different, it's not mutually exclusive or do you think conceptually it's better to have no options in the client itself? but the same motivation, right? i'm just thinking. imagine i'm connected to a relay and follow alice. waiting for her to say "hi". but there are other 100 people saying "hi" all the time. so, i'll never see the "hi" from alice. with that scenario, i think there should be more in the algorithm. for example, count duplicates only from pubkeys not in the contacts list, so only the public feed. otherwise, the risk is high "important" messages are swallowed. either way, let's add this. even temporary, maybe it'll help a bit.
offbyn commented 2 years ago
Review

i guess i'm hoping one day i won't have to reload the page :)

indeed I should keep that more in mind, it's shouldn't just explode after some days, currently memory prob just grows forever.

indexdb could drop old events maybe 🤔 , (array.unshift()).

> i guess i'm hoping one day i won't have to reload the page :) indeed I should keep that more in mind, it's shouldn't just explode after some days, currently memory prob just grows forever. indexdb could drop old events maybe 🤔 , (`array.unshift()`).
x1ddos commented 2 years ago
Review

drop old events

definitely. well, from the page memory. then i'd imagine in the future, i can scroll back and forth and the client requests older/newer events and re-render.

> drop old events definitely. well, from the page memory. then i'd imagine in the future, i can scroll back and forth and the client requests older/newer events and re-render.
offbyn commented 2 years ago
Review

n the future, i can scroll back and forth and the client requests older/newer events and re-render.

this is the way 👍

> n the future, i can scroll back and forth and the client requests older/newer events and re-render. this is the way 👍
...replyList.filter(({content}) => content === evt.content),
].filter(({created_at}) => ((created_at + 3600) > evt.created_at));
if (similarEvents?.length >= 5) {
console.info(`DROP event with content: "${evt.content.trim()}" already got ${similarEvents.length} similar events withing the last hour`, similarEvents, `\n dropped event id: ${evt.id}`);
return false;
}
}
return true;
}
function handleTextNote(evt, relay) {
if (!isValidNote(evt)) {
return;
}
if (eventRelayMap[evt.id]) {
eventRelayMap[evt.id] = [relay, ...(eventRelayMap[evt.id])];
} else {
@ -273,9 +300,6 @@ function handleTextNote(evt, relay) {
}
}
const replyList = [];
const reactionMap = {};
const getReactionList = (id) => {
return reactionMap[id]?.map(({content}) => content) || [];
};
@ -357,6 +381,7 @@ setInterval(() => {
}, 10000);
const getNoxyUrl = (type, url, id, relay) => {
return false;
if (!isHttpUrl(url)) {
return false;
}
@ -371,6 +396,9 @@ const fetchQue = [];
let fetchPending;
const fetchNext = (href, id, relay) => {
const noxy = getNoxyUrl('meta', href, id, relay);
if (!noxy) {
return;
}
const previewId = noxy.searchParams.toString();
if (fetchPending) {
fetchQue.push({href, id, relay});

@ -45,7 +45,7 @@ input[type="radio"]:checked + label {
.tab-content {
max-width: 96ch;
min-height: 200px;
padding: calc(.5 * var(--gap)) 0 100px 0;
padding: var(--gap-half) 0 100px 0;
}
.tabbed {
align-items: start;

Loading…
Cancel
Save