#!/bin/bash
# TODO: extract URLs from build manifest, download them if needed, and generate corresponding CHANGELOG section for them
# TODO: detect the difference between a found URL that was downloaded with wget, and one found on waybackmachine that hasn't been downloaded yet
# TODO: Autogenerate command to download latest waybackmachine chunks for found URLs (and/or run it); see the sketch after the download loop below
# eg. wget --no-verbose --directory-prefix orig2/ --force-directories --no-host-directories --cut-dirs 5 http://web.archive.org/web/20230925172110/https://cdn.oaistatic.com/_next/static/chunks/bd26816a-796eab5008811694.js
# TODO: refactor to read the URLs and build date from STDIN/args/similar (see the sketch below)
# TODO: refactor to read our JSON format for urls/etc?
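# One possible shape for the STDIN/args TODO above; a sketch only, not wired in.
# It assumes one URL per line on stdin when "-" is passed as the first argument
# (the "-" convention is an assumption, not an existing interface):
#   if [[ "${1:-}" == "-" ]]; then
#     urls=()
#     while IFS= read -r line; do
#       [[ -n "$line" ]] && urls+=("$line")
#     done
#   fi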
# List of URLs to check
urls=(
  "https://cdn.oaistatic.com/_next/static/chunks/TODO1.js"
  "https://cdn.oaistatic.com/_next/static/chunks/TODO2.js"
)
# Date variable (YYYY-MM-DD format)
build_date="2024-01-02"
# Extract the Next.js build hash (the path segment after "static/") from a
# _buildManifest.js or _ssgManifest.js URL; prints nothing for other URLs.
extract_build_hash() {
  if [[ "$1" =~ _buildManifest\.js$ ]] || [[ "$1" =~ _ssgManifest\.js$ ]]; then
    echo "$1" | grep -oE 'static/[^/]+/' | cut -d '/' -f 2
  else
    echo ""
  fi
}
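# Usage example (the "abc123" hash is made up for illustration):
#   extract_build_hash "https://cdn.oaistatic.com/_next/static/abc123/_buildManifest.js"
#   -> prints "abc123"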
# Initialize variables
download_directory="orig/"
build_hash=""
missing_urls=""
found_urls=""
# Extract build hash from the first buildManifest or ssgManifest URL
for url in "${urls[@]}"; do
  build_hash=$(extract_build_hash "$url")
  if [[ -n "$build_hash" ]]; then
    break
  fi
done
# Loop through each URL to attempt download and check with the Wayback Machine API
echo "Attempting to download URLs:"
echo
for url in "${urls[@]}"; do
  if wget --no-verbose --directory-prefix "$download_directory" --force-directories --no-host-directories "$url"; then
    # wget succeeded, URL is considered found
    found_urls+="$url\n"
  else
    # wget failed, proceed to check with the Wayback Machine API
    echo "Checking URL with Wayback Machine: $url"
    wayback_result=$(curl -s "http://archive.org/wayback/available?url=$url")
    echo "$wayback_result"
    # The availability API returns an empty "archived_snapshots" object when no snapshot exists
    if [[ $(jq '.archived_snapshots | length' <<< "$wayback_result") -eq 0 ]]; then
      # URL not found in Wayback Machine
      missing_urls+="$url\n"
    else
      # URL found in Wayback Machine
      found_urls+="$url\n"
    fi
  fi
  echo
done
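# A possible extension for the autogenerate-download TODO at the top; a sketch
# only. Inside the loop above, the closest snapshot URL can be read from the
# same availability API response and turned into a ready-to-run wget command:
#   snapshot_url=$(jq -r '.archived_snapshots.closest.url // empty' <<< "$wayback_result")
#   if [[ -n "$snapshot_url" ]]; then
#     echo "wget --no-verbose --directory-prefix orig2/ --force-directories --no-host-directories --cut-dirs 5 $snapshot_url"
#   fi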
# Generate the changelog entry heading and notes based on which URLs are missing/found
if [[ -n "$missing_urls" ]] && [[ -n "$found_urls" ]]; then
  echo "## ${build_date}Z (\`$build_hash\`) \`[partial archive]\`"
  echo
  echo "### Notes"
  echo
  echo "The assets from this build weren't archived at the time, and could only be partially found via Wayback Machine/etc."
  echo
  echo "### Not From Build Manifest"
  echo
elif [[ -n "$missing_urls" ]]; then
  echo "## ${build_date}Z (\`$build_hash\`) \`[not archived]\`"
  echo
  echo "### Notes"
  echo
  echo "The assets from this build weren't archived at the time, and couldn't be found via Wayback Machine."
  echo
  echo "### Not From Build Manifest"
  echo
elif [[ -n "$found_urls" ]]; then
  echo "## ${build_date}Z (\`$build_hash\`)"
  echo
  echo "### Notes"
  echo
  echo "TODO"
  echo
  echo "### Not From Build Manifest"
  echo
fi
# Output found URLs
if [[ -n "$found_urls" ]]; then
  echo "#### Archived"
  echo
  echo "\`\`\`"
  echo -e "$found_urls"
  echo "\`\`\`"
  echo
fi
# Output missing URLs
if [[ -n "$missing_urls" ]]; then
  echo "#### Missing"
  echo
  echo "\`\`\`"
  echo -e "$missing_urls"
  echo "\`\`\`"
  echo
fi
# Generate commit message based on missing and found URLs
if [[ -n "$missing_urls" ]] && [[ -n "$found_urls" ]]; then
  # If there are both missing and found URLs
  commit_message="[content-partial] add $build_hash info/content from ${build_date}Z"
elif [[ -n "$missing_urls" ]]; then
  # If there are only missing URLs
  commit_message="[content-missing] add $build_hash info from ${build_date}Z"
elif [[ -n "$found_urls" ]]; then
  # If all URLs are found
  commit_message="[content] add $build_hash content from ${build_date}Z"
fi
echo "$commit_message"
if [[ -n "$found_urls" ]]; then
  echo
  echo "You can try downloading the found URLs from the Wayback Machine with:"
  echo "  wget --no-verbose --directory-prefix orig/ --force-directories --no-host-directories --cut-dirs 5 URLS"
fi