Still needs a lot of cleanup, but the theory is sound.
#!/bin/sh
#
# De-duplicate files by hard-linking.
#
# Reads a checksum list (one "<hash> <file name>" record per line), sorts it
# so identical hashes are adjacent, and then for every group of equal hashes:
#   * links the first file of the group into the store as <store>/<hash>
#   * replaces every further file of the group with a hard link to that
#     store entry.
#
# TODO: the original notes mention removing all vids that are 138 bytes in
# size (empty); nothing in this script implements that yet.

# Options belong in `set`, not in the shebang, so they also apply when the
# script is started as `sh script`.
set -eu

# Platform specific helpers. Both read NUL-separated file names on stdin.
#   xargs_0_stat_size_name  prints "<size in bytes> <name>" per file
#   xargs_0_sha256          prints "<sha256> <name>" per file (one space)
case "$(uname -s)" in
  Linux)
    xargs_0_stat_size_name() {
      # -r: do not run stat at all when the input is empty.
      xargs -0 -r stat -c '%s %n'
    }
    xargs_0_sha256() {
      # sha256sum separates hash and name with two characters ("  " in the
      # default text mode); squeeze that to one space so the output matches
      # the FreeBSD `sha256 -r` format.
      xargs -0 -r sha256sum | sed -e 's/  / /'
    }
    ;;
  FreeBSD)
    xargs_0_stat_size_name() {
      xargs -0 stat -f '%z %N'
    }
    xargs_0_sha256() {
      xargs -0 sha256 -r
    }
    ;;
  *)
    echo "Unsupported API: $(uname -s)" >&2
    # Exit statuses must be in 0..255; `exit -1` is rejected by POSIX shells.
    exit 1
    ;;
esac

# scrub
#
# Hashes every regular file below the current directory and prints (does not
# execute) one `ln -vf <first> <duplicate>` command for each file whose
# content hash equals that of an earlier file. Both paths are single-quoted
# for the shell, so names containing spaces or quotes survive.
# NOTE: an earlier comment said only files under 1MB are relinked; no size
# filter is applied here.
# Returns the exit status of the pipeline (its last stage).
scrub() {
  find . -type f -print0 |
    xargs_0_sha256 |
    LC_ALL=C sort |
    awk -v sq="'" '
      # Wrap s in single quotes, turning each embedded quote into the
      # close-quote, escaped-quote, open-quote sequence.
      function shquote(s,    n, parts, i, out) {
        n = split(s, parts, sq)
        out = sq parts[1]
        for (i = 2; i <= n; i++)
          out = out sq bs sq sq parts[i]
        return out sq
      }
      BEGIN { bs = sprintf("%c", 92); hash = ""; name = "" }
      NF < 2 { next }
      {
        # Everything after "<hash> " is the file name, spaces included.
        path = substr($0, length($1) + 2)
        # Appending "" forces a string (never numeric) comparison.
        if (($1 "") == hash) {
          print "ln -vf " shquote(name) " " shquote(path)
        } else {
          hash = $1 ""
          name = path
        }
      }'
}

# shorthash HASH
#
# Prints an abbreviated form of HASH: characters 1-7, "..", characters 57-64.
# For a 64 character sha256 hex digest that is the first 7 and last 8 digits.
shorthash() {
  printf '%s..%s\n' \
    "$(printf '%s\n' "${1-}" | cut -c1-7)" \
    "$(printf '%s\n' "${1-}" | cut -c57-64)"
}

# State shared between successive handle() calls.
oldhash=""  # hash of the group currently being processed ("" before any)
count=1     # number of files seen so far in the current group

# handle HASH NAME
#
# Processes one record of the sorted checksum list.
#   HASH equal to the previous record's hash: NAME is a duplicate and is
#     replaced by a hard link to ${shastor}/HASH.
#   otherwise: NAME starts a new group and is linked into the store as
#     ${shastor}/HASH; the change is reported on stdout (except for the very
#     first record, where there is no previous group to report).
# Reads:  shastor.  Reads and updates:  oldhash, count.
handle() {
  if [ "${oldhash}" = "${1}" ]; then
    # duplicate file
    ln -f -- "${shastor}/${1}" "${2}"
    count=$((count + 1))
  else
    # new file
    if [ -n "${oldhash}" ]; then
      echo "Detected hash change after ${count} files: $(shorthash "${oldhash}") -> $(shorthash "${1}")"
    fi
    ln -f -- "${2}" "${shastor}/${1}"
    oldhash="${1}"
    count=1
  fi
}

shalist=cdjs.sha  # input: "<hash> <name>" records
shastor=shas      # directory holding one hard link per distinct hash

if [ -r "${shalist}" ]; then
  mkdir -p "${shastor}"
  # The loop runs in a subshell of the pipeline; oldhash/count only need to
  # persist across iterations, which they do.
  LC_ALL=C sort "${shalist}" |
    while read -r hash name; do
      # Skip blank or malformed records that carry no file name.
      [ -n "${name}" ] || continue
      handle "${hash}" "${name}"
    done
else
  # As before, a missing list is not fatal: there is simply nothing to do.
  echo "${shalist}: not readable, nothing to do" >&2
fi