From 4ef83c4a1142d409d917fc19adeb4d5669446aa7 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Thu, 5 Feb 2026 17:28:27 +0100 Subject: [PATCH 1/9] overhaul instagram to include reels --- modules/instagram.js | 476 +++++++++++++++++++++++++++++++------------ 1 file changed, 349 insertions(+), 127 deletions(-) diff --git a/modules/instagram.js b/modules/instagram.js index a1a9313..693a69d 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -1,5 +1,89 @@ +// Helper to extract embedded Instagram JSON data from HTML responses +function extractEmbeddedInstagramJSON(response) { + const datas = []; + + let js_prefixes = [ + "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],[", + // Explorer embedded JSON has a different prefix + "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"PolarisQueryPreloaderCache\",\"add\",[],[" + ]; + + let prefix; + while (js_prefixes.length > 0) { + prefix = js_prefixes.shift(); + + // we go through the response line by line, because prefixes may + // occur multiple times but always on a single line + for (const line of response.split("\n")) { + if (line.indexOf(prefix) === -1) { + // prefix not found + continue; + } + + let json_bit = line.split(prefix.slice(0, -1))[1].split('')[0].trim(); + if (json_bit.endsWith(';')) { + json_bit = json_bit.substring(0, json_bit.length - 1); + } + + if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) { + // 'related posts', this is never what we are looking for + continue; + } + + if (prefix.indexOf("additionalDataLoaded") !== -1) { + // remove trailing ) + json_bit = json_bit.slice(0, -1); + } else if (js_prefixes.length === 0) { + // last prefix has some special handling in upstream code + json_bit = json_bit.split(']]}}')[0]; + } + + json_bit = json_bit.split('],["CometResourceScheduler"')[0]; + + try { + extracted_json = JSON.parse(json_bit); + // Explorer embedded JSON is wrapped differently + function _traverse_parsed_json(obj) { + for (let property in obj) { + if (!obj.hasOwnProperty(property)) { + // not actually a property + continue; + } + if (property === "result" && "response" in obj[property]) { + try { + return JSON.parse(obj[property]["response"]); + } catch (e) { + console.log('Instagram JSON parse error in explorer wrapper'); + console.log(obj[property]["response"]); + } + } else if (typeof (obj[property]) === "object") { + const res = _traverse_parsed_json(obj[property]); + if (res !== null) return res; + } else { + // not an object, can't contain the explorer JSON + continue; + } + } + return null; + } + const explorer_json = _traverse_parsed_json(extracted_json); + if (explorer_json !== null) { + datas.push(explorer_json); + } else { + datas.push(JSON.parse(json_bit)); + } + } catch (e) { + console.log('Instagram JSON parse error'); + console.log(json_bit); + } + } + } + + return datas; +}; + zeeschuimer.register_module( - 'Instagram (posts)', + 'Instagram (posts & reels)', 'instagram.com', function (response, source_platform_url, source_url) { let domain = source_platform_url.split("/")[2].toLowerCase().replace(/^www\./, ''); @@ -7,36 +91,103 @@ zeeschuimer.register_module( if (!["instagram.com"].includes(domain)) { return []; } + + const debug_logs = true; // determine what part of instagram we're working in - // 'view' unused for now but may have some bearing on how to parse the data - // in any case - let path = source_platform_url.split('?')[0].replace(/\/$/, '').split("/"); + const path = new URL(source_platform_url).pathname.split('/').filter(Boolean); + const source_url_path = new URL(source_url).pathname.split('/').filter(Boolean); let view = ""; - if (path.length === 3) { - // www.instagram.com, no sub URL + if (["logging_client_events"].includes(source_url_path[0])) { + // background requests for logging + if (debug_logs) console.log('ignoring background request ' + source_url); + return []; + } else if (path.length === 0) { + /// www.instagram.com, no sub URL view = "frontpage"; - } else if (["direct", "account", "directory", "lite", "legal"].includes(path[3])) { + } else if (["direct", "account", "directory", "lite", "legal" ].includes(path[0])) { // not post listings but misc instagram views/pages - // console.log('ignoring misc url ' + source_url); + // direct = messages; could be interesting in some cases (e.g. researcher sending themselves specific reels) + if (debug_logs) console.log('ignoring misc url ' + source_url); + return []; + } else if (["static_resources"].includes(path[0])) { + // static resources (e.g. for the web interface), not actual content + if (debug_logs) console.log('ignoring static resource ' + source_url); return []; } else if (source_url.indexOf('injected_story_units') >= 0) { // injected ads (this URL appears on many ad blocklists!) // might enable if we decide to also capture ads? but not clear where these actually show up in the // interface... - // console.log('ignoring ads from ' + source_url); + if (debug_logs) console.log('ignoring ads from ' + source_url); return []; - } else if (path[3] === "explore") { - if(path[4] === "locations") { + } else if (path[0] === "explore") { + view = "explore"; + if (path[1] === "locations") { view = "location"; - } else { + } else if (path[1] === "search") { // hashtag, location view view = "search"; } - } else { - // user pages or similar - view = "user"; + } else if (path[0] === "reels") { + // reels explore page + view = "reels"; + // NOTE: https://www.instagram/reels routes to a random reel (e.g. https://www.instagram.com/reels/DR1BchxgSxf/) + // This makes it difficult to distinguish between single reel pages and the reels explore page (though single reel pages are captured below) + if (path.length > 1 && path[1] === "audio") { + // reels shared audio page + view = "reels_audio"; + } + } else if (path[0] === "stories") { + if (path.length > 1 && path[1] === "highlights") { + // User highlight reels + // e.g. https://www.instagram.com/stories/highlights/numeric_highlight_code + view = "highlight_reels"; + // These objects are NOT complete reels; the "code" field is misleading (would connect to a different reel if used in URL) + // Can find real reel code in "store_feed_media" under "media_code", but the object is missing virtually everything else + // skipping as of 2026-jan-29; would need special handling + } else { + // 2026-2-5 stories explore page (https://www.instagram.com/stories/) does not exist + return []; + } + } else if (path[0] === "reel") { + // single reel + view = "single_reel"; + } else if (path[0] === "p") { + // single post page + view = "single_post"; + + // Assuming we caught everything else above, these should be user pages + } else if (path.length == 1) { + // user profile page w/ posts + // e.g. https://www.instagram.com/username/ + view = "user_posts"; + } else if (path.length > 1) { + // Additional user pages with different content (e.g. tagged posts, reels, etc.) + if (path[1] === "tagged") { + // user tagged posts page (e.g. https://www.instagram.com/user/tagged/) + view = "user_tagged"; + } else if (path[1] === "reels") { + // user reels page (e.g. https://www.instagram.com/username/reels/) + view = "user_reels"; + } else if (path[1] === "reposts") { + // user reposts page (e.g. https://www.instagram.com/username/reposts/) + view = "user_reposts"; + } else if (path[1] === "saved") { + // user saved posts page (e.g. https://www.instagram.com/username/saved/) + view = "user_saved"; + } else if (path[1] === "p") { + // single post page with extra path element (e.g. https://www.instagram.com/username/p/postcode/) + view = "single_post"; + } else if (path[1] === "reel") { + // single reel page with extra path element (e.g. https://www.instagram.com/username/reel/reelcode/) + view = "single_reel"; + } else { + // some other page; may not be user (path[0] could be new content type) + view = "unknown"; + console.log('Unknown page type', path, 'for url', source_platform_url); + } } + // console.log(view + ' view for ' + source_platform_url + ' from ' + source_url); // instagram sometimes loads content in the background without actually using it // maybe pre-caching it or something? @@ -49,81 +200,55 @@ zeeschuimer.register_module( // reels audio page f.ex. loads personalised reels in the background (unrelated to the audio) but doesn't // seem to actually use them) - // console.log('ignoring pre-cache ' + source_url); + if (debug_logs) console.log('ignoring pre-cache ' + source_url); return []; } + if (source_url.indexOf("/api/v1/discover/web/explore_grid/") >= 0) { + // Preload explorer content + // Not used on search or location explorer pages + // ✔️ confirmed working as of 2026-2-5 + if (view !== "explore") { + if (debug_logs) console.log('ignoring pre-cache ' + source_url); + return []; + } + } + let datas = []; try { + // some responses have this prefix that needs to be removed before parsing + // e.g. /api/vi1/clips/music/... + if (response.startsWith("for (;;);")) { + response = response.slice("for (;;);".length); + } // if it's JSON already, just parse it datas.push(JSON.parse(response)); } catch { // data can be embedded in the HTML in these JavaScript statements - // this is mostly used for: // - single post pages (e.g. https://www.instagram.com/p/C1hWCZLPQ9T/) - // ✔️ confirmed working as of 2024-aug-21 - - let js_prefixes = [ - "{\"require\":[[\"ScheduledServerJS\",\"handle\",null,[{\"__bbox\":{\"require\":[[\"RelayPrefetchedStreamCache\",\"next\",[],[" - ]; - - let prefix; - while (js_prefixes.length > 0) { - prefix = js_prefixes.shift(); - - // we go through the response line by line, because prefixes may - // occur multiple times but always on a single line - for (const line of response.split("\n")) { - if (line.indexOf(prefix) === -1) { - // prefix not found - continue; - } - - let json_bit = line.split(prefix.slice(0, -1))[1].split('')[0].trim(); - if (json_bit.endsWith(';')) { - json_bit = json_bit.substring(0, -1); - } - - if (json_bit.indexOf('adp_PolarisDesktopPostPageRelatedMediaGrid') >= 0) { - // 'related posts', this is never what we are looking for - continue; - } - - if (prefix.indexOf("additionalDataLoaded") !== -1) { - // remove trailing ) - json_bit = json_bit.slice(0, -1); - } else if (js_prefixes.length === 0) { - // last prefix has some special handling - // remove trailing stuff... - json_bit = json_bit.split(']]}}')[0]; - } - - json_bit = json_bit.split('],["CometResourceScheduler"')[0]; - + // - single reel pages (e.g. https://www.instagram.com/reel/C1hWCZLPQ9T/) + // ✔️ confirmed working as of 2026-2-5 - try { - datas.push(JSON.parse(json_bit)); - } catch { - console.log('bad json'); - console.log(json_bit); - // fine, not JSON after all - } - } - } - - if (datas.length === 0) { - // console.log('no datas for ' + source_url); + // Extract any embedded JSON fragments using shared helper + try { + datas.push(...(extractEmbeddedInstagramJSON(response) || [])); + } catch (e) { + // ignore + console.log(e); return []; } } - if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) { + if (datas.length === 0) { + // console.log('no datas for ' + source_url); + return []; + } else if (datas.length === 1 && 'lightspeed_web_request_for_igd' in datas[0] && source_url.endsWith('graphql')) { // this is one of those background requests // console.log('ignoring background request ' + source_url); datas = []; } - let possible_item_lists = ["medias", "feed_items", "fill_items", "two_by_two_item"]; + let possible_item_lists = ["items", "edges", "repost_grid_items", "medias", "feed_items", "fill_items", "two_by_two_item"]; let edges = []; // find edge lists in the extracted JSON data @@ -136,49 +261,125 @@ zeeschuimer.register_module( // not actually a property continue; } + // Handle frontpage and filter our background requests for it + if (property === "xdt_api__v1__feed__timeline__connection") { + if (view === "frontpage") { + // - posts in personal feed without adds (i.e. https://instagram.com) + // ✔️ confirmed working 2026-feb-5 + if (debug_logs) console.log('processing timeline edges from ' + source_url); + edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).map(edge => { + // this ensures suggested posts are also included + if(edge['media'] === null && edge['explore_story'] && edge['explore_story']['media']) { + return edge['explore_story']; + } else { + return edge; + } + }).filter(node => { + return "media" in node + && node["media"] !== null + && "id" in node["media"] + && "user" in node["media"] + && !!node["media"]["user"] + }).map(node => node["media"])); + return; + } else { + // this is a background request for the personal feed + if (debug_logs) console.log('ignoring background feed request ' + source_platform_url); + return; + } - // pages not covered: - // - explore (e.g. https://www.instagram.com/explore/) - // ❌ as of 2024-aug-21 - // - 'tagged' pages for a user (e.g. https://www.instagram.com/steveo/tagged/) - // ❌ as of 2024-aug-21 - // - 'reels' user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/reels/) - // ❌ as of 2024-aug-21 - // these do not load enough post metadata (e.g. author or caption), so too different from other items - // to parse - // - suggested posts on user feed - // these could easily be included... may add in the future - - if (possible_item_lists.includes(property) || property === "items") { - // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/) - // - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/) - // ✔️ confirmed working as of 2024-aug-21 - // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/) - // ✔️ confirmed working as of 2024-aug-21 - // - posts when opened by clicking on them - // ✔️ confirmed working as of 2024-aug-21 + // Handle most other pages with generic item list parsing to verify post/reels + } else if (possible_item_lists.includes(property)) { + if (Array.isArray(obj[property]) && obj[property].length === 0) { + // empty list + continue; + } let items; - if (property === "medias" || property === "fill_items") { - items = obj[property].map(media => media["media"]); + if (property === "edges" || property === "repost_grid_items") { + // `edges` lists have [{node: {media: {...}}}, ...] format + // or [{node: {...}}, ...] format + + // Normalize edges to node objects + const nodes = (obj[property] || []).map(entry => entry && (entry.node || entry)); + + // If nodes contain a `media` property, extract those media objects + const hasOwn = Object.prototype.hasOwnProperty; + const medias = nodes + .map(n => n && hasOwn.call(n, 'media') && n.media ? n.media : null) + .filter(Boolean); + + if (medias.length > 0) { + if (debug_logs) console.log('processing edges list (w/ media) from ' + source_url); + // Reels page e.g. https://www.instagram.com/reels/ + // ✔️ confirmed working 2026-feb-5 + // User reels page e.g. https://www.instagram.com/joshokeefeofficial/reels/ + // ✔️ confirmed working 2026-feb-5 + // Single reel page e.g. https://www.instagram.com/reels/DOgYE8ZjSBH/ + // ✔️ confirmed working 2026-feb-5 + // Single post page e.g. https://www.instagram.com/p/DUWPxaxD5BU/ + // ✔️ confirmed working 2026-feb-5 + // User reposts page e.g. https://www.instagram.com/username/reposts/ + // ✔️ confirmed working 2026-feb-5 + items = medias; + } else { + // If nodes themselves look like media objects (have id & media_type), use them + const nodeMediaLike = nodes.filter(n => n && 'id' in n && 'media_type' in n); + if (nodeMediaLike.length > 0) { + if (debug_logs) console.log('processing edges list (w/o media) from ' + source_url); + items = nodeMediaLike; + // Tagged posts page e.g. https://www.instagram.com/steveo/tagged/ + // ✔️ confirmed working 2026-feb-5 + } else { + // fallback to original property + items = obj[property]; + } + } + + + + + + } else if (property === "medias" || property === "fill_items") { + // Can be background loaded on various pages + if (["explore", "search"].includes(view)) { + // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore) + // ✔️ confirmed working as of 2026-feb-5 + // - posts on explore pages for specific tags (e.g. https://www.instagram.com/explore/tags/blessed/) + // ✔️ confirmed working as of 2026-feb-5 + if (debug_logs) console.log('processing medias/fill_items list from ' + source_url); + items = obj[property].map(media => media["media"]); + } else { + if (debug_logs) console.log('ignoring background medias/fill_items from ' + source_url); + continue; + } } else if (property === "feed_items") { + if (debug_logs) console.log('processing feed_items list from ' + source_url); items = obj[property].map(media => media["media_or_ad"]); - } else if (property === "items" && obj[property].length === obj[property].filter(i => Object.getOwnPropertyNames(i).join('') === 'media').length) { - // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/) - // ✔️ confirmed working as of 2024-aug-21 - if (property === 'items' && 'design' in obj) { - // this is loaded, but never actually displayed... - // seems to be a preview of reels for a given tag, but again, not - // actually visible in the interface afaics + } else if (property === "items" && obj[property].length === obj[property].filter(i => 'media' in i).length) { + + if (view === "explore" || ['api/v1/clips/music/', "api/v1/feed/saved/"].some(endpoint => source_url.indexOf(endpoint) >= 0)) { + // - posts on explore pages for sounds (e.g. https://www.instagram.com/reels/audio/290315579897542/) + // ✔️ confirmed working as of 2026-feb-5 + // User saved posts page (e.g. https://www.instagram.com/username/saved/) + // ✔️ confirmed working as of 2026-feb-5 + // Explore page reels are loaded here + // ✔️ confirmed working as of 2026-feb-5 + // Note: this loads reels via explorer, but can load both posts and reels e.g. in saved posts + if (debug_logs) console.log('processing explore items list with media property from ' + source_url); + items = obj[property].map(media => media["media"]); + } else { + if (debug_logs) console.log('ignoring background items with media property from ' + source_url); continue; - } - items = obj[property].filter(node => "media" in node).map(node => node["media"]).filter(node => { - return "id" in node - }); + } } else if (property === "two_by_two_item") { + if (debug_logs) console.log('processing two_by_two_item list from ' + source_url); // highlighted (4x size) items on e.g. tag overview page items = [obj[property]['channel']['media']] } else { + // Single reel popup e.g. https://www.instagram.com/reel/CsBfqYvuMg0/ + // ✔️ confirmed working 2026-feb-5 + if (debug_logs) console.log('processing generic items list from ' + source_url); items = obj[property]; } @@ -191,35 +392,23 @@ zeeschuimer.register_module( && "id" in item && "media_type" in item && "user" in item - && "caption" in item - && (!("product_type" in item) || item["product_type"] !== "story") + // && "caption" in item (partial reels may not have captions) + // ensure post/reel is "seen" (if that info is available) + && ("is_seen" in item ? item["is_seen"] !== false : true) // these next two are ads, which are not actually shown in the feed but still loaded in the // background - && (!("product type" in item) || item["product_type"] !== "ad") - && (!("link" in item) || !item["link"] || !item["link"].startsWith('https://www.facebook.com/ads/')) + && !("product_type" in item && item["product_type"] === "ad") + && !("link" in item && item["link"] && item["link"].startsWith('https://www.facebook.com/ads/')) ); })); } - } else if (!['user', 'location', 'search'].includes(view) && ["xdt_api__v1__feed__timeline__connection"].includes(property)) { - // - posts in personal feed *that are followed* (i.e. not suggested; e.g. https://instagram.com) - // ✔️ confirmed working 2024-feb-20 - edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).map(edge => { - // this ensures suggested posts are also included - if(edge['media'] === null && edge['explore_story'] && edge['explore_story']['media']) { - return edge['explore_story']; - } else { - return edge; - } - }).filter(node => { - return "media" in node - && node["media"] !== null - && "id" in node["media"] - && "user" in node["media"] - && !!node["media"]["user"] - }).map(node => node["media"])); + } else if (["xdt_api__v1__feed__user_timeline_graphql_connection", "xdt_location_get_web_info_tab"].includes(property)) { // - posts on user pages (e.g. https://www.instagram.com/ogata.yoshiyuki/) - // ✔️ confirmed working as of 2024-aug-21 + // ✔️ confirmed working as of 2026-feb-5 + // - posts on explore pages for locations (e.g. https://www.instagram.com/explore/locations/238875664/switzerland/) + // ✔️ confirmed working as of 2026-feb-5 + if (debug_logs) console.log('processing user timeline edges from ' + source_url); edges.push(...obj[property]["edges"].filter(edge => "node" in edge).map(edge => edge["node"]).filter(node => { return node !== null && "id" in node @@ -244,8 +433,41 @@ zeeschuimer.register_module( } } - // console.log('got ' + edges.length + ' via ' + source_url) + if (edges.length === 0) { + // console.log('no edges for ' + source_url); + // console.log(datas); + return []; + } + + let partial_count = 0; + // Add custom fields + const enriched = edges.map(edge => { + // update `id` field if partial object (to allow full object if looked up later) + // mark partial objects (missing caption or video_versions) + if (debug_logs) console.log('processing post/reel id ' + edge.code); + + // These are partial reel objects from user/audio/and other pages + if (!('caption' in edge) || !('video_versions' in edge)) { + edge = Object.assign({}, edge, { + id: 'partial_' + edge.id.toString(), + _zs_partial: true + }); + partial_count++; + } else { + edge = Object.assign({}, edge, { + _zs_partial: false + }); + } + + edge = Object.assign({}, edge, { + _zs_instagram_view: view, + }); + + return edge; + }); + + if (debug_logs) console.log(view + ' got ' + edges.length + ' (partial: ' + partial_count + ') via ' + source_url) // generic ad filter... - return edges.filter(edge => edge["product_type"] !== "ad"); + return enriched.filter(edge => edge["product_type"] !== "ad"); } ); \ No newline at end of file From 811435e24511696e63d09ac9d8f73405c8e68a88 Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Tue, 10 Feb 2026 11:40:55 +0100 Subject: [PATCH 2/9] add `decide_action` function to allow better duplicate item detection --- js/zs-background.js | 122 ++++++++++++++++++++++++++++++++++++++----- modules/instagram.js | 15 ++++-- 2 files changed, 120 insertions(+), 17 deletions(-) diff --git a/js/zs-background.js b/js/zs-background.js index 63c966b..7405fb2 100644 --- a/js/zs-background.js +++ b/js/zs-background.js @@ -18,18 +18,37 @@ window.zeeschuimer = { * @param callback Function to parse request content with, returning an Array of extracted items * @param module_id Module ID; if not given, use domain name as module ID. Use this if multiple modules read from * the same domain. + * @param decide_action Optional function to decide whether to insert/update/merge/skip items. */ - register_module: function (name, domain, callback, module_id=null) { + register_module: function (name, domain, callback, module_id=null, decide_action=null) { if(!module_id) { module_id = domain; } this.modules[module_id] = { name: name, domain: domain, - callback: callback + callback: callback, + decide_action: decide_action }; }, + /** + * Default decision logic for whether to store an item. + * + * Modules can override via `decide_action` on registration. + * Decisions: + * - 'insert': add a new row (even if another row already exists). + * - 'skip': do not write anything. + * - 'update': replace an existing row's data (keeps the original timestamp_collected). + * - 'merge': shallow-merge incoming item into existing row's data. + */ + default_decide_action: function (item, existing_item) { + if (existing_item) { + return 'skip'; + } + return 'insert'; + }, + /** * Initialise Zeeschuimer * Called on browser session start; increases session index to aid in deduplicating extracted items. @@ -175,20 +194,95 @@ window.zeeschuimer = { } let item_id = item["id"]; - let exists = await db.items.where({"item_id": item_id, "nav_index": nav_index}).first(); + if (item_id === undefined || item_id === null) { + console.warn('Item contained null item_id; skipping', item); + return; + } - if (!exists) { - await db.items.add({ - "nav_index": nav_index, + await db.transaction('rw', db.items, async () => { + const module = this.modules[module_id]; + let existing_item = await db.items.where({ "item_id": item_id, - "timestamp_collected": Date.now(), - "source_platform": module_id, - "source_platform_url": origin_url, - "source_url": document_url, - "user_agent": navigator.userAgent, - "data": item - }); - } + "nav_index": nav_index, + "source_platform": module_id + }).first(); + + let decision = null; + if (module && typeof module.decide_action === "function") { + decision = await module.decide_action(item, existing_item, nav_index); + } + + if (!decision) { + decision = this.default_decide_action(item, existing_item); + } + + let action = decision; + + if (typeof action === 'string') { + action = action.toLowerCase(); + } + + if (!['insert', 'skip', 'update', 'merge'].includes(action)) { + console.warn('Invalid decide_action result for module', module_id, action); + action = this.default_decide_action(item, existing_item).toLowerCase(); + } + + let target_item = existing_item; + + if (action === "skip") { + return; + } + + if (action === "insert" || !target_item) { + // Insert new item with incoming data + await db.items.add({ + "nav_index": nav_index, + "item_id": item_id, + "timestamp_collected": Date.now(), + "last_updated": Date.now(), + "source_platform": module_id, + "source_platform_url": origin_url, + "source_url": document_url, + "user_agent": navigator.userAgent, + "data": item + }); + return; + } + + if (action === "update") { + // Replace the stored data with the incoming item, keeping the original timestamp_collected. + await db.items.update(target_item.id, { + "nav_index": target_item.nav_index, + "item_id": item_id, + "timestamp_collected": target_item.timestamp_collected || Date.now(), + "last_updated": Date.now(), + "source_platform": module_id, + "source_platform_url": origin_url, + "source_url": document_url, + "user_agent": navigator.userAgent, + "data": item + }); + return; + } + + if (action === "merge") { + // Merge stored data with the incoming item (shallow merge). + const merged_data = Object.assign({}, target_item.data || {}, item); + + await db.items.update(target_item.id, { + "nav_index": target_item.nav_index, + "item_id": item_id, + "timestamp_collected": target_item.timestamp_collected || Date.now(), + "last_updated": Date.now(), + "source_platform": module_id, + "source_platform_url": origin_url, + "source_url": document_url, + "user_agent": navigator.userAgent, + "data": merged_data + }); + return; + } + }); })); return; diff --git a/modules/instagram.js b/modules/instagram.js index 693a69d..0c0e792 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -92,7 +92,7 @@ zeeschuimer.register_module( return []; } - const debug_logs = true; + const debug_logs = false; // determine what part of instagram we're working in const path = new URL(source_platform_url).pathname.split('/').filter(Boolean); @@ -442,14 +442,12 @@ zeeschuimer.register_module( let partial_count = 0; // Add custom fields const enriched = edges.map(edge => { - // update `id` field if partial object (to allow full object if looked up later) // mark partial objects (missing caption or video_versions) if (debug_logs) console.log('processing post/reel id ' + edge.code); // These are partial reel objects from user/audio/and other pages if (!('caption' in edge) || !('video_versions' in edge)) { edge = Object.assign({}, edge, { - id: 'partial_' + edge.id.toString(), _zs_partial: true }); partial_count++; @@ -469,5 +467,16 @@ zeeschuimer.register_module( if (debug_logs) console.log(view + ' got ' + edges.length + ' (partial: ' + partial_count + ') via ' + source_url) // generic ad filter... return enriched.filter(edge => edge["product_type"] !== "ad"); + }, + null, + function (item, existing_item, nav_index) { + // If the existing item is partial, create a new item instead of updating it. + if (existing_item && existing_item.data && existing_item.data._zs_partial === true) { + console.log('Existing item is partial, inserting new item instead of updating'); + return 'insert'; + } + + // Default behavior (item_id + nav_index) for all other cases. + return null; } ); \ No newline at end of file From f5f2700566da757698913c86193bd99f78ef936d Mon Sep 17 00:00:00 2001 From: Dale Wahl Date: Wed, 11 Feb 2026 16:09:41 +0100 Subject: [PATCH 3/9] add user duplicate behavior feature --- js/zs-background.js | 113 +++++++++++++++++++++++++++++-------------- modules/instagram.js | 27 ++++++++--- popup/interface.html | 16 ++++++ popup/interface.js | 28 +++++++++++ 4 files changed, 142 insertions(+), 42 deletions(-) diff --git a/js/zs-background.js b/js/zs-background.js index 7405fb2..6f3c323 100644 --- a/js/zs-background.js +++ b/js/zs-background.js @@ -18,9 +18,11 @@ window.zeeschuimer = { * @param callback Function to parse request content with, returning an Array of extracted items * @param module_id Module ID; if not given, use domain name as module ID. Use this if multiple modules read from * the same domain. - * @param decide_action Optional function to decide whether to insert/update/merge/skip items. + * @param overwrite_partial Optional function to determine if incoming item should replace existing item. + * Signature: (incoming_item, existing_item) => boolean. Returns true if incoming should + * replace existing, false otherwise. Backend routes to same-nav or any-nav based on availability. */ - register_module: function (name, domain, callback, module_id=null, decide_action=null) { + register_module: function (name, domain, callback, module_id=null, overwrite_partial=null) { if(!module_id) { module_id = domain; } @@ -28,27 +30,10 @@ window.zeeschuimer = { name: name, domain: domain, callback: callback, - decide_action: decide_action + overwrite_partial: overwrite_partial }; }, - /** - * Default decision logic for whether to store an item. - * - * Modules can override via `decide_action` on registration. - * Decisions: - * - 'insert': add a new row (even if another row already exists). - * - 'skip': do not write anything. - * - 'update': replace an existing row's data (keeps the original timestamp_collected). - * - 'merge': shallow-merge incoming item into existing row's data. - */ - default_decide_action: function (item, existing_item) { - if (existing_item) { - return 'skip'; - } - return 'insert'; - }, - /** * Initialise Zeeschuimer * Called on browser session start; increases session index to aid in deduplicating extracted items. @@ -180,6 +165,18 @@ window.zeeschuimer = { } nav_index = nav_index.session + ":" + nav_index.tab_id + ":" + nav_index.index; + const duplicate_behavior_key = 'zs-duplicate-behavior'; + const duplicate_behavior = await browser.storage.local.get(duplicate_behavior_key); + let action_on_duplicate = duplicate_behavior[duplicate_behavior_key] || 'insert'; + if (typeof action_on_duplicate === 'string') { + action_on_duplicate = action_on_duplicate.toLowerCase(); + } + // 'merge' not yet supported via UI (and is untested) + if (!['insert', 'skip', 'update', 'merge'].includes(action_on_duplicate)) { + console.warn('Invalid global duplicate behavior setting', action_on_duplicate, '; using default "insert" behavior'); + action_on_duplicate = 'insert'; + } + let item_list = []; for (let module_id in this.modules) { if(!enabled_modules.includes(module_id)) { @@ -201,40 +198,85 @@ window.zeeschuimer = { await db.transaction('rw', db.items, async () => { const module = this.modules[module_id]; - let existing_item = await db.items.where({ + const existing_item_current_nav = await db.items.where({ "item_id": item_id, "nav_index": nav_index, "source_platform": module_id }).first(); + // Cross-nav lookup: same item_id and platform across all time. + const existing_item_any_nav = await db.items.where({ + "item_id": item_id, + "source_platform": module_id + }).first(); - let decision = null; - if (module && typeof module.decide_action === "function") { - decision = await module.decide_action(item, existing_item, nav_index); - } - - if (!decision) { - decision = this.default_decide_action(item, existing_item); + let action = null; + let target_item = null; + + if (existing_item_current_nav) { + // Item appears again on the same navigation index + // Check module overwrite_partial to determine whether to update or skip. + // This allows modules to update incomplete items that are captured multiple times during the same navigation + // And ensure complete items are not overwritten with partial data + if (module && typeof module.overwrite_partial === "function" && await module.overwrite_partial(item, existing_item_current_nav)) { + // Update existing item with more complete data + action = 'update'; + target_item = existing_item_current_nav; + } else { + // Default for same-nav duplicate is to skip, as it's most likely a true duplicate. + action = 'skip'; + target_item = existing_item_current_nav; + } + } else if (existing_item_any_nav) { + // Item appears again but on a different navigation index. + // Check global fallback behavior to determine action + target_item = existing_item_any_nav; + if (action_on_duplicate === 'insert') { + action = 'insert'; + } else if (action_on_duplicate === 'skip') { + // Only update if module overwrite_partial explicitly returns true for cross-nav duplicates + // This implies we have only capture a partial object at this point + if (module && typeof module.overwrite_partial === "function" && await module.overwrite_partial(item, existing_item_any_nav)) { + action = 'update'; + } else { + action = 'skip'; + } + } else if (["update", "merge"].includes(action_on_duplicate)) { + // Do not update/merge if module overwrite_partial explicitly returns false for cross-nav duplicates + // This prevents us from overwriting complete items with partial data if we have only captured a partial object at this point + if (module && typeof module.overwrite_partial === "function" && await module.overwrite_partial(item, existing_item_any_nav) === false) { + action = 'skip'; + } else { + action = action_on_duplicate; + } + } else { + // Invalid fallback action, default to insert + console.warn('Invalid global duplicate behavior setting', action_on_duplicate, '; using default "insert" behavior'); + action = 'insert'; + } + } else { + // No duplicates, insert new item + action = 'insert'; } - let action = decision; - + // Normalize action string. if (typeof action === 'string') { action = action.toLowerCase(); } + // Validate action; fall back to insert if invalid. if (!['insert', 'skip', 'update', 'merge'].includes(action)) { - console.warn('Invalid decide_action result for module', module_id, action); - action = this.default_decide_action(item, existing_item).toLowerCase(); + console.warn('Invalid action for module', module_id, action, '; using insert'); + action = 'insert'; + target_item = null; } - let target_item = existing_item; - if (action === "skip") { return; } - if (action === "insert" || !target_item) { + if (action === "insert") { // Insert new item with incoming data + console.log('Inserting new item for module', module_id, 'with item_id', item_id); await db.items.add({ "nav_index": nav_index, "item_id": item_id, @@ -250,6 +292,7 @@ window.zeeschuimer = { } if (action === "update") { + console.log('Updating existing item for module', module_id, 'with item_id', item_id); // Replace the stored data with the incoming item, keeping the original timestamp_collected. await db.items.update(target_item.id, { "nav_index": target_item.nav_index, diff --git a/modules/instagram.js b/modules/instagram.js index 0c0e792..fa6d4aa 100644 --- a/modules/instagram.js +++ b/modules/instagram.js @@ -469,14 +469,27 @@ zeeschuimer.register_module( return enriched.filter(edge => edge["product_type"] !== "ad"); }, null, - function (item, existing_item, nav_index) { - // If the existing item is partial, create a new item instead of updating it. - if (existing_item && existing_item.data && existing_item.data._zs_partial === true) { - console.log('Existing item is partial, inserting new item instead of updating'); - return 'insert'; + function (incoming_item, existing_item) { + // Return true if incoming item should replace existing; false otherwise. + // Compare partial vs full: upgrade partial to full; don't downgrade full to partial. + if (!existing_item || !existing_item.data) { + return false; } - // Default behavior (item_id + nav_index) for all other cases. - return null; + const existing_partial = existing_item.data._zs_partial === true; + const incoming_partial = incoming_item && incoming_item._zs_partial === true; + + // Upgrade: partial → full + if (existing_partial && !incoming_partial) { + return true; + } + + // Downgrade protection: full → partial + if (!existing_partial && incoming_partial) { + return false; + } + + // No opinion on same completeness level + return false; } ); \ No newline at end of file diff --git a/popup/interface.html b/popup/interface.html index df41b48..f750476 100644 --- a/popup/interface.html +++ b/popup/interface.html @@ -428,6 +428,22 @@

Uploaded datasets