Digital Archivingz
So atm I'm thinking of using docker compose ArchiveBox with the pihole container active, and passing in a number of URLs parsed from my browser history and bookmarks. The history pass runs on a daily schedule; the bookmarks get checked every month or so for changes and re-archived if they've changed (not implemented ofc). My ArchiveBox.conf is as follows:
SAVE_WGET_REQUISITES = True
PUBLIC_INDEX = False
PUBLIC_SNAPSHOTS = False
PUBLIC_ADD_VIEW = False
SAVE_MERCURY = False
SAVE_READABILITY = False
SAVE_SCREENSHOT = False
SAVE_PDF = False
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_WGET = True
SAVE_WARC = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_MEDIA = True
SAVE_GIT = True
SAVE_ARCHIVE_DOT_ORG = True

This is mostly for cutting down on formats I don't need (screenshot, PDF) and formats that only post-process already-captured data (mercury, readability).
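For context, the compose file is close to the stock ArchiveBox one. A minimal sketch of what I mean (the pihole service details, the subnet, and the dns wiring here are assumptions for illustration, not a copy of my real file):

services:
  archivebox:
    image: archivebox/archivebox
    volumes:
      - ./data:/data
    dns:
      - 172.20.0.53   # resolve through the pihole container below
  pihole:
    image: pihole/pihole
    networks:
      default:
        ipv4_address: 172.20.0.53

networks:
  default:
    ipam:
      config:
        - subnet: 172.20.0.0/24

Docker's dns: option wants a plain IP, which is why pihole gets pinned to a static address on the compose network.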
## TODO: add the script for the thingy -- update: I have made the script, and am debugging it and getting it into a finished state !!
The archival script
#!/usr/bin/env python
import sqlite3
import subprocess
from pathlib import Path, PosixPath
# ignore takes precedence; must_have only rescues long urls (see the length check below)
must_have_urls = [
        "neocities", # etc
        ]
ignore_urls = [
        "duckduckgo.com", # etc
        ]
def process_exists(proc, ty = 0):
    # scan `ps -A` output; ty == 0 matches by process name, anything else by pid
    ps = subprocess.run(["ps", "-A"], capture_output=True, text=True)
    for line in ps.stdout.splitlines()[1:]:  # [1:] skips the header row
        fields = line.split()
        if len(fields) >= 4:  # PID TTY TIME CMD
            pid = fields[0]
            pname = fields[3]
            if ty == 0:
                if pname == proc:
                    return True
            elif pid == proc:
                return True
    return False
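# (places.sqlite can be mid-write while the browser is open, hence the check
#  below. an alternative sketch: pgrep -x does the same scan in one call,
#  and returncode 0 means at least one exact-name match)
# def process_exists(proc):
#     return subprocess.run(["pgrep", "-x", proc],
#                           stdout=subprocess.DEVNULL).returncode == 0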
if process_exists("librewolf"):
    raise Exception("librewolf is running !!! *dies*")
places_db = PosixPath("/home/user/.librewolf/{profile}/places.sqlite")
latest_id_path = Path("latest_id.txt")
# first run: no checkpoint file yet, so start from id 0
latest_id = int(latest_id_path.read_text()) if latest_id_path.exists() else 0
print(places_db.resolve())
tmp_db = Path("/tmp/firefox_places.db")
try:
    # copy the db so we query a snapshot instead of the live file
    tmp_db.write_bytes(places_db.read_bytes())
except OSError as e:
    raise SystemExit("can't copy places.sqlite: " + str(e))
history_table = "moz_places"
urls = []
ids = []
try:
    con = sqlite3.connect(str(tmp_db))
    con.text_factory = lambda data: str(data, errors="surrogateescape") # bc might not be utf8
    cur = con.cursor()
    # order by id so the checkpoint logic at the bottom always moves forward
    res = cur.execute(f"SELECT id, url FROM {history_table} WHERE url IS NOT NULL ORDER BY id")
    for row in res:
        skip = False
        if row[0] <= latest_id: skip=True
        for url in ignore_urls:
            if url in row[1]:
                skip = True
        
        if not skip:
            url = row[1]
            # strip fragments and cloudflare cache-buster params
            if '#' in url:
                url = url[:url.find("#")]
            if '__cf_' in url:
                url = url[:url.find("__cf_")].rstrip("?&")
            # dedupe, treating http and https versions as the same page
            if url not in urls and url.replace("http://", "https://") not in urls:
                # long urls are usually junk (tracking params, search results),
                # so only keep them if they match the must-have list
                if len(url) < 40:
                    urls.append(url)
                    ids.append(row[0])
                    print(row, url)
                else:
                    for musturl in must_have_urls:
                        if musturl in url:
                            urls.append(url)
                            ids.append(row[0])
                            print(row, url)
                            break  # don't append twice if two patterns match
    print(len(urls))
    con.close()
except Exception as e: 
    print("hav u closed firefox?")
    print(e)
tmp_db.unlink()
length = len(urls)
# hand the urls to archivebox in batches of 25, checkpointing after each batch
for i in range(0, length, 25):
    print(i)
    subprocess.run(["docker", "compose", "run", "archivebox", "add", " ".join(urls[i:i+25]), "--overwrite"])
    # record the last id of the batch just archived (not the first, which
    # would make the next run redo most of this batch)
    latest_id_path.write_text(str(ids[min(i + 25, length) - 1]))
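One tweak I'll probably make: the {profile} bit has to be filled in by hand right now. A sketch that globs for it instead (assumes exactly one profile actually has a places.sqlite):

# pick the profile folder automatically instead of hardcoding it
candidates = sorted(Path.home().glob(".librewolf/*/places.sqlite"))
places_db = candidates[0]  # assumption: only one real profile exists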
It's probably not the best, and I haven't made it do bookmarks yet, which is probably more useful, but that's probably just a tweak of the SQL query (see the sketch below) XP
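Something like this, maybe. A sketch from memory of the Firefox schema (moz_bookmarks rows with type = 1 are actual bookmarks, and fk points at the matching moz_places row), so double-check the column names before trusting it:

import sqlite3

con = sqlite3.connect("/tmp/firefox_places.db")
cur = con.cursor()
# type = 1 marks real bookmarks (2 is folders, 3 separators); fk joins to moz_places.id
res = cur.execute(
    "SELECT p.id, p.url FROM moz_bookmarks b "
    "JOIN moz_places p ON p.id = b.fk "
    "WHERE b.type = 1 AND p.url IS NOT NULL"
)
for row in res:
    print(row)
con.close()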