Digital Archivingz
so atm im thinking of using docker compose archivebox w the pihole container active, and then passing in a number of urls parsed from my browser history and bookmarks - on a schedule that runs every day, and the bookmarks get checked every month or so for changes, and archived if they have changed (not implemented ofc). my ArchiveBox.conf is as follows:
SAVE_WGET_REQUISITES = True
PUBLIC_INDEX = False
PUBLIC_SNAPSHOTS = False
PUBLIC_ADD_VIEW = False
SAVE_MERCURY = False
SAVE_READABILITY = False
SAVE_SCREENSHOT = False
SAVE_PDF = False
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_WGET = True
SAVE_WARC = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_MEDIA = True
SAVE_GIT = True
SAVE_ARCHIVE_DOT_ORG = True

this is mostly for cutting down on formats i dont need (screenshot, pdf) and formats that only change already existing data (mercury, readability).
## TODO : add the script for the thingy -- update: i has made the script and am debug it and get it in finished state !!
The archival script
#!/usr/bin/env python
"""Queue new LibreWolf history URLs for archiving with ArchiveBox.

The script copies the browser's ``places.sqlite`` to a temporary file, reads
every history row newer than the id stored in ``latest_id.txt``, filters and
normalises the URLs, and hands them to ``docker compose run archivebox add``
in batches. After each successful batch the highest archived row id is
written back to ``latest_id.txt`` so the next run starts from there.
"""
import sqlite3
import subprocess
from contextlib import closing
from pathlib import Path, PosixPath

# ignore takes precedence
must_have_urls = [
    "neocities",
    # etc
]
ignore_urls = [
    "",
    # etc
]

places_db = PosixPath("/home/user/.librewolf/{profile}/places.sqlite")
latest_id_path = Path("latest_id.txt")
tmp_db = Path("/tmp/firefox_places.db")
history_table = "moz_places"

# URLs at least this long are only kept when they match must_have_urls.
MAX_URL_LENGTH = 40
# Number of URLs handed to a single `archivebox add` invocation.
BATCH_SIZE = 25


def process_exists(proc, ty=0):
    """Return True if a process matching *proc* shows up in ``ps -A``.

    Args:
        proc: Process name (when ``ty == 0``) or pid (any other ``ty``).
        ty: 0 to compare against the command column, anything else to
            compare against the pid column.
    """
    result = subprocess.run(
        ["ps", "-A"],
        capture_output=True,
        text=True,
        errors="replace",
        check=False,
    )
    wanted = str(proc)
    for line in result.stdout.splitlines():
        fields = line.split()
        # Columns are PID TTY TIME CMD, so index 3 needs at least 4 fields.
        if len(fields) < 4:
            continue
        candidate = fields[3] if ty == 0 else fields[0]
        if candidate.strip() == wanted:
            return True
    return False


def read_latest_id(path):
    """Return the row id stored in *path*, or 0 when the file does not exist."""
    try:
        return int(path.read_text().strip())
    except FileNotFoundError:
        # First run: nothing has been archived yet.
        return 0


def clean_url(url):
    """Cut *url* at the first ``#`` fragment and at the first ``__cf_`` marker."""
    if "#" in url:
        url = url[:url.find("#")]
    if "__cf_" in url:
        url = url[:url.find("__cf_")]
    return url


def select_urls(rows, latest_id, ignore=None, must_have=None):
    """Pick the URLs worth archiving from ``(id, url)`` history rows.

    A row is dropped when its id is not newer than *latest_id* or when its
    URL contains any non-empty *ignore* pattern. Remaining URLs are cleaned
    with :func:`clean_url` and de-duplicated (an ``http://`` URL counts as a
    duplicate of its ``https://`` twin). URLs of ``MAX_URL_LENGTH`` characters
    or more are kept only if they contain one of the *must_have* patterns.

    Args:
        rows: Iterable of ``(id, url)`` tuples.
        latest_id: Highest row id handled by a previous run.
        ignore: Substrings that exclude a URL; defaults to ``ignore_urls``.
        must_have: Substrings that rescue a long URL; defaults to
            ``must_have_urls``.

    Returns:
        A ``(urls, ids)`` pair of equally long lists, in input order.
    """
    # Empty patterns are dropped: "" is a substring of every URL and would
    # otherwise make the ignore list swallow the whole history.
    ignore = [p for p in (ignore_urls if ignore is None else ignore) if p]
    must_have = [p for p in (must_have_urls if must_have is None else must_have) if p]

    urls = []
    ids = []
    seen = set()
    for row_id, raw_url in rows:
        if row_id <= latest_id:
            continue
        if any(pattern in raw_url for pattern in ignore):
            continue
        url = clean_url(raw_url)
        if url in seen or url.replace("http://", "https://") in seen:
            continue
        # any() keeps a URL from being added once per matching pattern.
        if len(url) < MAX_URL_LENGTH or any(p in url for p in must_have):
            seen.add(url)
            urls.append(url)
            ids.append(row_id)
    return urls, ids


def fetch_history_rows(db_path, latest_id):
    """Return ``(id, url)`` rows newer than *latest_id*, ordered by id."""
    with closing(sqlite3.connect(str(db_path))) as con:
        # bc might not be utf8
        con.text_factory = lambda data: str(data, errors="surrogateescape")
        cur = con.execute(
            f"SELECT id, url FROM {history_table} "
            "WHERE url IS NOT NULL AND id > ? ORDER BY id",
            (latest_id,),
        )
        return cur.fetchall()


def main():
    """Run one archiving pass over the browser history."""
    if process_exists("librewolf"):
        raise RuntimeError("librewolf is running !!! *dies*")

    latest_id = read_latest_id(latest_id_path)
    print(places_db.resolve())

    # Work on a copy so the browser's own database is never opened directly.
    try:
        tmp_db.write_bytes(places_db.read_bytes())
    except OSError as e:
        print("cant move")
        print(e)
        tmp_db.unlink(missing_ok=True)
        return

    try:
        rows = fetch_history_rows(tmp_db, latest_id)
    except sqlite3.Error as e:
        print("hav u closed firefox?")
        print(e)
        rows = []
    finally:
        tmp_db.unlink(missing_ok=True)

    urls, ids = select_urls(rows, latest_id)
    for row_id, url in zip(ids, urls):
        print(row_id, url)
    print(len(urls))

    for start in range(0, len(urls), BATCH_SIZE):
        print(start)
        batch_urls = urls[start:start + BATCH_SIZE]
        batch_ids = ids[start:start + BATCH_SIZE]
        # Each URL is its own argument so nothing depends on how ArchiveBox
        # splits a single space-joined string.
        result = subprocess.run(
            ["docker", "compose", "run", "archivebox", "add", *batch_urls, "--overwrite"],
            check=False,
        )
        if result.returncode != 0:
            # Leave the checkpoint where it is so this batch is retried.
            print("archivebox add failed with exit code", result.returncode)
            break
        # Checkpoint the highest row id archived so far, not a list offset.
        latest_id_path.write_text(str(max(batch_ids)))


if __name__ == "__main__":
    main()
its probly not the best, and i havent made it do bookmarks, which is probably more useful, but thats probably just a tweak of the sql query XP