# Digital Archivingz
so atm im thinking of using docker compose to run archivebox with the pihole container active, then passing in a batch of urls parsed from my browser history and bookmarks, on a schedule that runs every day. the bookmarks would also get checked every month or so for changes, and re-archived if they changed (not implemented ofc).
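the compose side would look roughly like this -- a minimal sketch from memory, not my actual file, so the ips, volume paths, and network block are all assumptions:

```yaml
# rough sketch of the setup -- ips, paths, and network config here
# are assumptions, not my real compose file
services:
  archivebox:
    image: archivebox/archivebox
    volumes:
      - ./data:/data
    dns:
      - 172.20.0.53   # resolve through pihole so ad/tracker domains die before archiving
  pihole:
    image: pihole/pihole
    networks:
      default:
        ipv4_address: 172.20.0.53
networks:
  default:
    ipam:
      config:
        - subnet: 172.20.0.0/24
```

the daily schedule is then just a cron job on the host that runs the script below.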
my ArchiveBox.conf is as follows:

```ini
SAVE_WGET_REQUISITES = True
PUBLIC_INDEX = False
PUBLIC_SNAPSHOTS = False
PUBLIC_ADD_VIEW = False
SAVE_MERCURY = False
SAVE_READABILITY = False
SAVE_SCREENSHOT = False
SAVE_PDF = False
SAVE_TITLE = True
SAVE_FAVICON = True
SAVE_WGET = True
SAVE_WARC = True
SAVE_DOM = True
SAVE_SINGLEFILE = True
SAVE_MEDIA = True
SAVE_GIT = True
SAVE_ARCHIVE_DOT_ORG = True
```

this is mostly for cutting down on formats i dont need (screenshot, pdf) and formats that only change already existing data (mercury, readability).
## TODO: add the script for the thingy -- update: i made the script, debugged it, and got it into a finished state !!
## The archival script

```python
#!/usr/bin/env python
import sqlite3
import subprocess
from pathlib import Path
# ignore_urls takes precedence over must_have_urls
must_have_urls = [
    "neocities",  # etc
]
ignore_urls = [
    "duckduckgo.com",  # etc
]
def process_exists(proc, by_pid=False):
    """Return True if `proc` matches a running process name (or PID if by_pid)."""
    ps = subprocess.run(["ps", "-A"], capture_output=True, text=True)
    for line in ps.stdout.splitlines():
        fields = line.split()
        # ps -A columns: PID TTY TIME CMD
        if len(fields) >= 4:
            pid, pname = fields[0], fields[3]
            if (pid if by_pid else pname) == proc:
                return True
    return False
if process_exists("librewolf"):
raise Exception("librewolf is running !!! *dies*")
places_db = Path("/home/user/.librewolf/{profile}/places.sqlite")
latest_id_path = Path("latest_id.txt")
# id of the newest history entry archived so far; start at 0 on the first run
latest_id = int(latest_id_path.read_text()) if latest_id_path.exists() else 0
print(places_db.resolve())
# work on a copy so we never touch the live database
tmp_db = Path("/tmp/firefox_places.db")
try:
    tmp_db.write_bytes(places_db.read_bytes())
    print(tmp_db.exists())
except OSError as e:
    print("cant copy the places db:", e)
history_table = "moz_places"
urls = []
ids = []
try:
    con = sqlite3.connect(str(tmp_db))
    con.text_factory = lambda data: str(data, errors="surrogateescape")  # bc might not be utf8
    cur = con.cursor()
    # ordered by id so the resume logic at the bottom always records the highest processed id
    res = cur.execute(f"SELECT id, url FROM {history_table} WHERE url IS NOT NULL ORDER BY id")
    for row in res:
        skip = row[0] <= latest_id  # already handled on a previous run
        for ignore in ignore_urls:
            if ignore in row[1]:
                skip = True
        if not skip:
            url = row[1]
            # strip fragments and cloudflare cache-busting params
            if '#' in url:
                url = url[:url.find('#')]
            if '__cf_' in url:
                url = url[:url.find('__cf_')]
            # skip dupes, including the https twin of an http url
            if url not in urls and url.replace("http://", "https://") not in urls:
                if len(url) < 40:
                    # short urls are probably landing pages, keep them all
                    urls.append(url)
                    ids.append(row[0])
                    print(row, url)
                else:
                    # long urls only make the cut if they match the must-have list
                    for musturl in must_have_urls:
                        if musturl in url:
                            urls.append(url)
                            ids.append(row[0])
                            print(row, url)
                            break
    print(len(urls))
    con.close()
except Exception as e:
    print("hav u closed firefox?")
    print(e)
tmp_db.unlink(missing_ok=True)  # clean up the copy
# hand the urls to archivebox in batches of 25
length = len(urls)
for i in range(0, length, 25):
    print(i)
    subprocess.run(["docker", "compose", "run", "archivebox", "add", " ".join(urls[i:i + 25]), "--overwrite"])
    # remember the last (highest) id of this batch, so the next run can resume
    latest_id_path.write_text(str(ids[min(i + 25, length) - 1]))
```
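side note: joining 25 urls into a single argument seems to work because archivebox parses urls out of whatever text its given, but piping them over stdin might be cleaner. an untested variant, assuming `docker compose run -T` to keep stdin attached:

```python
# untested variant: feed the batch to archivebox over stdin instead of argv
# (-T disables the pseudo-tty so the piped input goes through cleanly)
subprocess.run(
    ["docker", "compose", "run", "-T", "archivebox", "add", "--overwrite"],
    input="\n".join(urls[i:i + 25]),
    text=True,
)
```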
its probly not the best, and i havent made it do bookmarks yet, which would probably be more useful, but thats likely just a tweak of the sql query (sketched below) XP
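for reference, the tweak id try first -- untested, but moz_bookmarks keeps a fk column pointing at moz_places.id, and type = 1 means an actual bookmark (2 is a folder, 3 a separator):

```python
# untested sketch: select bookmarked urls instead of the whole history
res = cur.execute("""
    SELECT p.id, p.url
    FROM moz_bookmarks b
    JOIN moz_places p ON p.id = b.fk
    WHERE b.type = 1 AND p.url IS NOT NULL
    ORDER BY p.id
""")
```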