package noxy

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"net/url"
	"path"
	"sort"
	"strings"
	"sync"
	"time"

	"github.com/dyatlov/go-opengraph/opengraph"
	nostr "github.com/nbd-wtf/go-nostr"
	xurls "mvdan.cc/xurls/v2"
)

var (
	ErrNotFound             = errors.New("event or resource not found")
	ErrUnsupportedEventKind = errors.New("unsupported event kind")
	ErrUnsupportedMimeType  = errors.New("unsupported link mime type")
	ErrUnsupportedRelay     = errors.New("unsupported relay")
)

// LinkMeta contains metadata info about a URL.
// it is typically assembled from OGP (https://ogp.me) by Noxer.FetchLinkMeta.
type LinkMeta struct {
	Type        string   // og:type
	Title       string   // og:title
	Description string   // og:description
	ImageURLs   []string // og:image:secure_url or og:image:url
}

// Noxer can proxy link preview info and data streams.
// See FetchLinkMeta and StreamLinkData for details.
//
// while the only required field is Cache, a zero-value KnownRelays
// makes Noxer refuse to proxy any URLs.
type Noxer struct {
	// Cache is used to store both link preview meta info and
	// data streamed to clients. it must be non-nil for Noxer to be usable.
	Cache Cacher

	// Noxer refuses to work with web pages and data streams larger than this value.
	MaxFileSize int64 // defaults to 1MB

	// how long to keep an open connection to a relay without any activity.
	// an activity is any cache-miss call to FetchLinkMeta or StreamLinkData.
	// connections to relays are used to verify whether a link is part of
	// an event's contents. see the aforementioned methods for more details.
	IdleRelayTimeout time.Duration // defaults to 1min

	// Noxer connects only to relays whose hostnames are listed here.
	// in other words, slice elements are only the hostname parts of relay URLs.
	// KnownRelays must be sorted in ascending order.
	KnownRelays []string

	// HTTPClient is used to make HTTP connections when fetching link preview
	// info and streaming data. when nil, http.DefaultClient is used.
	HTTPClient *http.Client

	// clients keeps track of nostr relay connections to clean them up
	// and remove idle ones after IdleRelayTimeout.
	clientsMu    sync.Mutex
	clients      map[string]*relayClient
	cleanupTimer *time.Timer

	// slurpers keeps track of ongoing HTTP requests, both link preview
	// meta info and data streams.
	slurpersMu sync.Mutex
	slurpers   map[string]chan struct{}
}

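// a minimal setup sketch, not part of the API: someCache stands for any existing
// Cacher implementation, and the relay hostnames, event ID and link are placeholders.
// note KnownRelays is kept sorted, as required by the whitelisting binary search:
//
//	relays := []string{"relay.example.com", "nostr.example.org"}
//	sort.Strings(relays)
//	noxer := &Noxer{
//		Cache:       someCache,
//		MaxFileSize: 1 << 20, // 1MB
//		KnownRelays: relays,
//	}
//	meta, err := noxer.FetchLinkMeta(ctx, eventID, "wss://relay.example.com", "https://example.com/article")
//	if err != nil {
//		// handle ErrNotFound, ErrUnsupportedRelay, etc.
//	}
//	_ = meta.Title
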
// relayClient wraps nostr.Relay with an additional timestamp
// indicating last use of the relay, to keep track of all active relay
// connections and remove idle ones.
//
// lastUsed is updated every time Noxer.fetchNostrEvent is called.
type relayClient struct {
	relay    *nostr.Relay
	lastUsed time.Time
}

// FetchLinkMeta requests the web page at link URL, parses it as HTML and returns
// metadata found in the contents. It refuses to parse remote responses with
// a content-type other than text/html.
//
// the link URL must be found in the content field of the nostr event posted to the
// specified relay. FetchLinkMeta connects to the nostr relay at relayURL
// and sends a filter'ed request with the ids field set to eventID.
// the received event contents are "grepped" for the value of link as is.
//
// relayURL's hostname must be an element of x.KnownRelays.
// the remote must respond with HTTP 200 OK to the link URL.
//
// successfully parsed link metadata is cached using Cacher.PutJSON. so, subsequent
// calls should not hit the remote server again unless x.Cache fails.
// concurrent requests are suspended until the context or the first call is done.
func (x *Noxer) FetchLinkMeta(ctx context.Context, eventID, relayURL, link string) (*LinkMeta, error) {
	if err := x.verifyEventLink(ctx, eventID, relayURL, link, verifyNoMeta); err != nil {
		return nil, fmt.Errorf("verifyEventLink: %w", err)
	}
	return x.slurpLinkMeta(ctx, link)
}

func (x *Noxer) slurpLinkMeta(ctx context.Context, link string) (*LinkMeta, error) {
	// use cache here instead of directly in FetchLinkMeta to avoid
	// hitting remotes in x.verifyEventLink as much as possible.
	cacheKey := MakeCacheKey(link, CacheKeyURLPreview)
	var meta LinkMeta
	cacheErr := x.Cache.GetJSON(ctx, cacheKey, &meta)
	if cacheErr == nil {
		return &meta, nil
	}
	log.Printf("cache.getjson %s(%s): %v", link, cacheKey, cacheErr)

	ds, err := x.detachedSlurpData(ctx, link)
	if err != nil {
		return nil, fmt.Errorf("detachedSlurpData: %w", err)
	}
	defer ds.Close()
	if mtype := ds.MimeType(); mtype != "text/html" {
		return nil, fmt.Errorf("%w: received %q, want text/html", ErrUnsupportedMimeType, mtype)
	}
	res, err := parseLinkMeta(ds)
	if err != nil {
		return nil, fmt.Errorf("parseLinkMeta: %w", err)
	}
	if err := x.Cache.PutJSON(ctx, cacheKey, res); err != nil {
		log.Printf("cache.putjson %s(%s): %v", link, cacheKey, err)
	}
	return res, nil
}

// StreamLinkData opens an HTTP connection to link and streams the response back.
// while doing so, it also caches the response bytes using Cache.PutStream. so,
// subsequent calls should not hit the remote link again unless x.Cache fails.
//
// the link URL must be found in the "content" field of the nostr event posted to the
// specified relay. StreamLinkData connects to the nostr relay at relayURL
// and sends a filter'ed request with the ids field set to eventID.
// for event kinds 1 (text note) and 42 (channel message), the event contents
// are simply "grepped" for the value of link as is.
// for event kinds 0 (set metadata), 40 (create channel) and 41 (set channel
// metadata), the link is checked against the "picture" field.
//
// additionally, link URL may be one of LinkMeta.ImageURLs as returned by
// x.FetchLinkMeta to a call with the same eventID.
//
// relayURL's hostname must be an element of x.KnownRelays.
// the remote must respond with HTTP 200 OK to the link URL.
//
// callers must close the returned DataStream.
// concurrent requests are suspended until the context or the first call is done.
func (x *Noxer) StreamLinkData(ctx context.Context, eventID, relayURL, link string) (*DataStream, error) {
	if err := x.verifyEventLink(ctx, eventID, relayURL, link, verifyExpandMeta); err != nil {
		return nil, err
	}
	cacheKey := MakeCacheKey(link, CacheKeyData)
	ds, err := x.Cache.GetStream(ctx, cacheKey)
	if err != nil {
		log.Printf("cache.getstream %s(%s): %v", link, cacheKey, err)
		ds, err = x.detachedSlurpData(ctx, link)
	}
	return ds, err
}

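// a rough serving sketch, not part of the API, showing how an HTTP handler might
// proxy bytes through StreamLinkData. the handler name and query parameter names
// are made up; DataStream is used only as the io.ReadCloser with a ContentType
// field that the methods above return.
//
//	func handleData(x *Noxer) http.HandlerFunc {
//		return func(w http.ResponseWriter, r *http.Request) {
//			q := r.URL.Query()
//			ds, err := x.StreamLinkData(r.Context(), q.Get("event"), q.Get("relay"), q.Get("url"))
//			if err != nil {
//				http.Error(w, err.Error(), http.StatusBadGateway)
//				return
//			}
//			defer ds.Close()
//			w.Header().Set("Content-Type", ds.ContentType)
//			io.Copy(w, ds)
//		}
//	}
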
// detachedSlurpData always finishes data streaming from the remote url, even if
// the returned DataStream is closed prematurely, to cache the bytes for subsequent calls.
func (x *Noxer) detachedSlurpData(ctx context.Context, url string) (*DataStream, error) {
	// check whether there's an ongoing stream. if so, wait and use cache or fail.
	cacheKey := MakeCacheKey(url, CacheKeyData)
	cacheKeyStr := cacheKey.Path()
	x.slurpersMu.Lock()
	slurpCh, found := x.slurpers[cacheKeyStr]
	if found {
		// a previous call is already streaming.
		// wait 'till they're done, because the stream is non-seekable,
		// then get it from cache or fail.
		x.slurpersMu.Unlock()
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-slurpCh:
			return x.Cache.GetStream(ctx, cacheKey)
		}
	} else {
		// wouldn't need this branch if close(slurpCh) was done after x.slurpersMu.Lock()
		// in the goroutine below.
		// but it's so easy to miss in future code changes that i don't want to risk it:
		// not a big deal to check the cache one more time.
		// reconsider if performance here becomes a concern.
		ds, err := x.Cache.GetStream(ctx, cacheKey)
		if err == nil {
			x.slurpersMu.Unlock()
			return ds, nil
		}
	}
	// no other goroutine is streaming; do it now and make others wait on slurpCh.
	slurpCh = x.makeSlurperChan(cacheKeyStr)
	x.slurpersMu.Unlock()

	// assuming 1min is enough to download a file.
	// this may be too short for large values of x.MaxFileSize.
	// TODO: compute ctx based on x.MaxFileSize?
	ctx, cancelHTTP := context.WithTimeout(context.Background(), time.Minute)
	req, err := http.NewRequestWithContext(ctx, "GET", url, nil)
	if err != nil {
		cancelHTTP()
		return nil, err
	}
	resp, err := x.httpClient().Do(req)
	if err != nil {
		cancelHTTP()
		return nil, err
	}
	if resp.StatusCode != http.StatusOK {
		cancelHTTP()
		if resp.StatusCode == http.StatusNotFound {
			return nil, ErrNotFound
		}
		return nil, fmt.Errorf("bad HTTP response %s: %s", url, resp.Status)
	}
	ctype := resp.Header.Get("Content-Type")
	if ctype == "" {
		// TODO: sniff using mime magic bytes?
		ctype = "application/octet-stream"
	}
	// rout is returned to the caller, wout is tee'ed from resp.Body.
	// if the caller closes rout, tee'ing to wout also stops.
	rout, wout := io.Pipe()
	go func() {
		defer func() {
			resp.Body.Close()
			wout.Close()
			cancelHTTP()
			close(slurpCh)
			x.slurpersMu.Lock()
			delete(x.slurpers, cacheKeyStr)
			x.slurpersMu.Unlock()
		}()
		// the std io.TeeReader wouldn't work since it reports errors on reads
		// from tee as soon as writes to wout fail, which is the case if the caller
		// closes rout.
		tee := SinkTeeReader(HardLimitReader(resp.Body, x.maxFileSize()), wout)
		if err := x.Cache.PutStream(ctx, cacheKey, ctype, tee); err != nil {
			log.Printf("cache.putstream %s: %v", cacheKey, err)
			// TODO: don't close; io.copy(wout, resp.body) here on cache failures?
		}
	}()
	return &DataStream{ContentType: ctype, r: rout}, nil
}

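// SinkTeeReader and HardLimitReader (used above) are package helpers defined
// elsewhere. as a rough sketch of the behavior assumed here, and not their actual
// code: HardLimitReader fails the read once more than the given number of bytes
// has been consumed, and SinkTeeReader acts like io.TeeReader except that write
// errors to the sink are ignored, so caching finishes even if the caller closes rout:
//
//	type sinkTeeReader struct {
//		r io.Reader
//		w io.Writer
//	}
//
//	func (t *sinkTeeReader) Read(p []byte) (int, error) {
//		n, err := t.r.Read(p)
//		if n > 0 {
//			t.w.Write(p[:n]) // write error intentionally ignored
//		}
//		return n, err
//	}
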
// expandMeta arg values for verifyEventLink
const (
	verifyExpandMeta = true
	verifyNoMeta     = false
)

// verifyEventLink checks whether link URL is in a nostr event's content,
// or one of the OGP link preview URLs if expandMeta is true.
func (x *Noxer) verifyEventLink(ctx context.Context, eventID, relayURL, link string, expandMeta bool) error {
	if !x.whitelistedRelay(relayURL) {
		return ErrUnsupportedRelay
	}
	eventURLs, err := x.fetchEventURLs(ctx, eventID, relayURL)
	if err != nil {
		return err
	}
	log.Printf("fetched event URLs: %q", eventURLs)
	for _, u := range eventURLs {
		if u == link {
			return nil
		}
	}
	if !expandMeta {
		return ErrNotFound
	}

	// link not found in the event text/json.
	// check URLs in OGP metadata for each suitable link found in the event.
	for _, urlInEvent := range eventURLs {
		// try only the cache first. a client may have already requested /meta
		// with this URL. if so, no further parsing or network roundtrips are needed.
		var cachedMeta LinkMeta
		if x.Cache.GetJSON(ctx, MakeCacheKey(urlInEvent, CacheKeyURLPreview), &cachedMeta) == nil {
			if nonSortedSliceContains(cachedMeta.ImageURLs, link) {
				return nil // ok; found
			}
			continue // move on to the next url in the event
		}
		// cache failed or missed; possibly fetch from remote and parse.
		if !looksLikeHTMLPage(urlInEvent) {
			continue
		}
		meta, err := x.slurpLinkMeta(ctx, urlInEvent)
		if err != nil {
			log.Printf("verifyEventLink slurpLinkMeta(%s): %v", urlInEvent, err)
			continue
		}
		if nonSortedSliceContains(meta.ImageURLs, link) {
			return nil // ok; found
		}
	}
	return ErrNotFound
}

// fetchEventURLs returns all URLs found in a nostr event.
// it assumes the relay URL has already been checked against x.KnownRelays.
func (x *Noxer) fetchEventURLs(ctx context.Context, eventID, relayURL string) ([]string, error) {
	// check whether there's an ongoing fetch. if so, wait and use cache or fail.
	cacheKey := MakeCacheKey(eventID, CacheKeyEvent)
	cacheKeyStr := cacheKey.Path()
	x.slurpersMu.Lock()
	slurpCh, found := x.slurpers[cacheKeyStr]
	if found {
		// a previous call is already fetching.
		// wait 'till they're done, then get it from cache or fail.
		x.slurpersMu.Unlock()
		select {
		case <-ctx.Done():
			return nil, ctx.Err()
		case <-slurpCh:
			var urls []string
			err := x.Cache.GetJSON(ctx, cacheKey, &urls)
			return urls, err
		}
	} else {
		// same reasoning as in detachedSlurpData.
		// wouldn't need this branch if close(slurpCh) was done after x.slurpersMu.Lock()
		// in the deferred cleanup below. but it's too easy to miss in future code changes.
		// checking the cache one more time here is most likely insignificant compared to
		// opening a websocket to a nostr relay.
		var urls []string
		if err := x.Cache.GetJSON(ctx, cacheKey, &urls); err == nil {
			x.slurpersMu.Unlock()
			return urls, nil
		}
	}
	// no other goroutine is fetching; do it now and make others wait on slurpCh.
	slurpCh = x.makeSlurperChan(cacheKeyStr)
	x.slurpersMu.Unlock()
	defer func() {
		close(slurpCh)
		x.slurpersMu.Lock()
		delete(x.slurpers, cacheKeyStr)
		x.slurpersMu.Unlock()
	}()

	event, err := x.fetchNostrEvent(ctx, eventID, relayURL)
	if err != nil {
		return nil, err
	}
	var eventURLs []string
	switch event.Kind {
	default:
		return nil, ErrUnsupportedEventKind
	case nostr.KindTextNote, nostr.KindChannelMessage:
		eventURLs = extractAcceptableURLs(event.Content)
	case nostr.KindSetMetadata, nostr.KindChannelCreation, nostr.KindChannelMetadata:
		var p struct{ Picture string }
		if err := json.Unmarshal([]byte(event.Content), &p); err != nil {
			return nil, err
		}
		if validURL(p.Picture) {
			eventURLs = append(eventURLs, p.Picture)
		}
	}
	if err := x.Cache.PutJSON(ctx, cacheKey, eventURLs); err != nil {
		log.Printf("cache.putjson %s: %v", cacheKey, err)
	}
	return eventURLs, nil
}

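// an illustrative sketch of the two content shapes handled above; the values are
// made up. for kinds 1 and 42, URLs are extracted straight from the note text;
// for kinds 0, 40 and 41, the content is a JSON document and only "picture" counts:
//
//	kind 1:  "check this out https://example.com/cat.png"
//	         -> eventURLs = ["https://example.com/cat.png"]
//	kind 0:  {"name":"alice","about":"...","picture":"https://example.com/avatar.jpg"}
//	         -> eventURLs = ["https://example.com/avatar.jpg"]
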
// fetchNostrEvent fetches the event with eventID from the relay at relayURL.
// it assumes the relay is already whitelisted.
func (x *Noxer) fetchNostrEvent(ctx context.Context, eventID, relayURL string) (*nostr.Event, error) {
	relay, err := x.relayConn(ctx, relayURL)
	if err != nil {
		return nil, err
	}
	var (
		event    *nostr.Event
		fetchErr error
	)
	// assuming 10sec is more than enough for a simple filter'ed sub with a single
	// event ID.
	ctx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	done := make(chan struct{})
	go func() {
		defer close(done)
		f := nostr.Filter{IDs: []string{eventID}, Limit: 1}
		sub := relay.Subscribe(nostr.Filters{f})
		defer sub.Unsub()
		select {
		case e := <-sub.Events:
			// e.CheckSignature() is already done by the client
			event = &e
		case <-ctx.Done():
			fetchErr = ctx.Err()
		}
	}()
	select {
	case <-done:
		return event, fetchErr
	case <-ctx.Done():
		return nil, ctx.Err()
	}
}

// relayConn connects to a nostr relay at relayURL or reuses an existing conn.
// it blocks all other callers.
func (x *Noxer) relayConn(ctx context.Context, relayURL string) (*nostr.Relay, error) {
	// check existing conn and reuse if found
	relayURL = nostr.NormalizeURL(relayURL)
	x.clientsMu.Lock()
	defer x.clientsMu.Unlock()
	if cl, ok := x.clients[relayURL]; ok {
		// "touch" lastUsed to let the cleanup timer know we aren't idling
		cl.lastUsed = time.Now()
		return cl.relay, nil
	}

	// none found. make a new conn.
	var (
		relay   *nostr.Relay
		connErr error
	)
	// assuming 10sec is more than enough to connect to a websocket.
	connCtx, cancel := context.WithTimeout(ctx, 10*time.Second)
	defer cancel()
	done := make(chan struct{})
	go func() {
		// TODO: send a patch upstream for a nostr.RelayConnectContext(ctx, url)
		relay, connErr = nostr.RelayConnect(relayURL)
		close(done)
	}()
	select {
	case <-connCtx.Done():
		// unfortunately, this leaves the above goroutine hanging, and they will keep
		// piling up for non-responsive relays.
		// can be solved with a nostr.RelayConnectContext.
		return nil, connCtx.Err()
	case <-done:
		if connErr != nil {
			return nil, connErr
		}
	}

	if x.clients == nil {
		x.clients = make(map[string]*relayClient)
	}
	x.clients[relayURL] = &relayClient{
		relay:    relay,
		lastUsed: time.Now(),
	}
	// a self-cleanup goroutine to delete ourselves if the relay reports conn errors.
	go func() {
		err := <-relay.ConnectionError
		log.Printf("%s: closing due to: %v", relayURL, err)
		x.clientsMu.Lock()
		defer x.clientsMu.Unlock()
		relay.Close()
		delete(x.clients, relayURL)
	}()
	if x.cleanupTimer == nil {
		x.cleanupTimer = time.AfterFunc(x.idleRelayTimeout(), x.cleanupRelayConn)
	}
	return relay, nil
}

// cleanupRelayConn closes and deletes nostr relay connections idling for more
// than x.idleRelayTimeout().
func (x *Noxer) cleanupRelayConn() {
	x.clientsMu.Lock()
	defer x.clientsMu.Unlock()
	for url, cl := range x.clients {
		if time.Since(cl.lastUsed) > x.idleRelayTimeout() {
			log.Printf("closing idle conn to %s", url)
			cl.relay.Close()
			delete(x.clients, url)
		}
	}
	if len(x.clients) > 0 {
		x.cleanupTimer = time.AfterFunc(time.Minute, x.cleanupRelayConn)
	} else {
		x.cleanupTimer = nil
	}
}

// makeSlurperChan assumes x.slurpersMu is handled by the caller.
func (x *Noxer) makeSlurperChan(k string) chan struct{} {
	if x.slurpers == nil {
		x.slurpers = make(map[string]chan struct{})
	}
	ch := make(chan struct{})
	x.slurpers[k] = ch
	return ch
}

func (x *Noxer) httpClient() *http.Client {
	if x.HTTPClient == nil {
		return http.DefaultClient
	}
	return x.HTTPClient
}

func (x *Noxer) idleRelayTimeout() time.Duration {
	if x.IdleRelayTimeout == 0 {
		return time.Minute
	}
	return x.IdleRelayTimeout
}

func (x *Noxer) maxFileSize() int64 {
	if x.MaxFileSize == 0 {
		return 1 << 20 // 1MB
	}
	return x.MaxFileSize
}

// whitelistedRelay reports whether a nostr relay at urlStr is in x.KnownRelays.
// it expects x.KnownRelays to be sorted in lexical order.
//
// only the hostname of urlStr is checked against x.KnownRelays.
func (x *Noxer) whitelistedRelay(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	host := u.Hostname()
	i := sort.SearchStrings(x.KnownRelays, host)
	return i < len(x.KnownRelays) && x.KnownRelays[i] == host
}

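// for reference, parseLinkMeta below relies on OGP <meta> tags in the page <head>;
// a made-up example of the markup it consumes:
//
//	<meta property="og:type" content="article">
//	<meta property="og:title" content="An example page">
//	<meta property="og:description" content="Short summary shown in previews">
//	<meta property="og:image" content="https://example.com/preview.png">
//
// a page without any og:image entries is treated as ErrNotFound.
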
// TODO: use oEmbed if OGP fails?
func parseLinkMeta(r io.Reader) (*LinkMeta, error) {
	og := opengraph.NewOpenGraph()
	if err := og.ProcessHTML(r); err != nil {
		return nil, err
	}
	if len(og.Images) == 0 {
		return nil, ErrNotFound
	}
	meta := &LinkMeta{
		Type:        og.Type,
		Title:       og.Title,
		Description: og.Description,
		ImageURLs:   make([]string, 0, len(og.Images)),
	}
	for _, img := range og.Images {
		u := img.SecureURL
		if u == "" {
			u = img.URL
		}
		if u == "" {
			continue
		}
		meta.ImageURLs = append(meta.ImageURLs, u)
	}
	return meta, nil
}

// TODO: patch to extract only host/ip; no emails and such
var urlRegexp = xurls.Relaxed()

func extractAcceptableURLs(text string) []string {
	var urls []string
	for _, a := range urlRegexp.FindAllString(text, -1) {
		if validURL(a) {
			urls = append(urls, a)
		}
	}
	return urls
}

func validURL(urlStr string) bool {
	if urlStr == "" {
		return false
	}
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	if u.Hostname() == "" {
		return false
	}
	return u.Scheme == "" || u.Scheme == "http" || u.Scheme == "https"
}

// nonSortedSliceContains reports whether elem is in a; an O(n) lookup.
func nonSortedSliceContains(a []string, elem string) bool {
	for _, v := range a {
		if v == elem {
			return true
		}
	}
	return false
}

// looksLikeHTMLPage reports whether urlStr looks like a URL of an HTML web page.
func looksLikeHTMLPage(urlStr string) bool {
	u, err := url.Parse(urlStr)
	if err != nil {
		return false
	}
	ext := path.Ext(u.Path)
	// any .xxxhtml is ok
	return ext == "" || strings.HasSuffix(ext, "html") || strings.HasSuffix(ext, "htm")
}

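// a few made-up examples of how the URL helpers above behave:
//
//	extractAcceptableURLs("read https://example.com/a.html and http://example.org")
//	// -> ["https://example.com/a.html", "http://example.org"]
//	validURL("ftp://example.com/file")               // false: only http, https or no scheme
//	looksLikeHTMLPage("https://example.com/pic.png") // false: .png extension
//	looksLikeHTMLPage("https://example.com/post")    // true: no extension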