Skip to content

Commit

Permalink
dchash: Fairly large improvements all round, i.e. a rewrite (#260)
Browse files Browse the repository at this point in the history
* Only evaluate the last `=` assignment
* Correctly handle `+=` assignments
* Correctly (try to) handle ADOC_ATTRIBUTES values
  • Loading branch information
Stefan Knorr committed Sep 8, 2021
1 parent 110da01 commit 7b01d3b
Showing 1 changed file with 67 additions and 33 deletions.
100 changes: 67 additions & 33 deletions bin/docserv-dchash
Original file line number Diff line number Diff line change
@@ -1,17 +1,21 @@
#!/bin/bash
# Create hash sum that identifies the unique features of a DC file, simplifying
# grouping in the overview page.

# $1 - DC
# $2 - alternate ROOTID (optional)
#
# -m - output minimal version of DC file
# $2 - DC
# $3 - alternate ROOTID (optional)

# Print an error message to stderr and abort the script.
# $1 - message text; backslash escapes (\n, \t, ...) are expanded
# Exits with status 1, so it never returns to the caller.
out() {
  # printf '%b' instead of `echo -e`: a message such as "-n" or "-e" would
  # otherwise be consumed as an echo option and nothing would be printed.
  printf '%b\n' "$1" >&2
  exit 1
}

me=$(test -L $(realpath $0) && readlink $(realpath $0) || echo $(realpath $0))
mydir=$(dirname $me)
me=$(test -L $(realpath "$0") && readlink $(realpath "$0") || echo $(realpath "$0"))
mydir=$(dirname "$me")

# -m as first argument switches on output of the minimal DC text in addition
# to the hash; it is shifted away so the DC file is always "$1" afterwards.
output_mini=0
[[ "$1" == '-m' ]] && { output_mini=1; shift; }

# Quote "$1": an unquoted expansion is word-split and glob-expanded, which
# breaks DC paths containing spaces or wildcard characters. If realpath fails
# (e.g. no argument), dcfile stays empty and the -f check below reports it.
dcfile=$(realpath -- "$1")
[[ ! -f "$dcfile" ]] && out "No input DC file given."
Expand All @@ -21,10 +25,11 @@ dcfile=$(realpath $1)
# * strictly uses line breaks (\n), there is no way to combine multiple
# attributes on a single source line
# * allows continuing on `+=` lines; doing so automatically adds a space in
# the middle, using `MAIN=book_ \n MAIN+=admin.xml` = `MAIN=book_ admin.xml`
# the middle, rather hilariously, using `MAIN=book_ \n MAIN+=admin.xml` ==
# `MAIN=book_ admin.xml` -- so close!
# * in most cases quotes are irrelevant for the content (MAIN, ROOTID, PDFNAME,
# OUTPUTNAME, PROF...), in other cases (ADOC_ATTRIBUTES, and potentially
# XSLTPARAM), they may be extremely relevant
# XSLTPARAM in the future), they may be extremely relevant
#
# Unfortunately, including profiling data (PROF[A-Z]+) in the
# minimized DC files is both necessary and a source of errors. e.g.
Expand All @@ -34,37 +39,66 @@ dcfile=$(realpath $1)
# because it does not touch on virtualization.
# However, it is necessary e.g. for SLES for SAP where the same guide is
# shipped with either "quick start" or "full guide" profiling.
minimaldc=$( \
cat "$dcfile" | \
sed -r \
-e 's/(\r+|\s+|"|'"'"')//g' \
| \
sed -r -n '/^(ROOTID|MAIN|PDFNAME|OUTPUTNAME|PROF[A-Z]+|ADOC_ATTRIBUTES)\+?=/ p' \
)

# Normalize DC file text read from stdin and write it to stdout:
#   * every carriage return becomes a line break
#   * whitespace around the attribute name and the =/+= operator is removed
#   * one pair of double or single quotes enclosing the whole value is removed
#   * PDFNAME is renamed to OUTPUTNAME
normalize_dc() {
  # Convert \r with tr *before* sed: a newline inserted by sed stays inside
  # the current pattern space, so the ^-anchored rules below would only ever
  # see the first assignment of a CR-only file.
  tr '\r' '\n' | sed -r \
    -e 's/^\s*([A-Z_-]+)\s*(\+?=)\s*/\1\2/g' \
    -e 's/^([A-Z_-]+\+?=)"(.*)"\s*$/\1\2/g' \
    -e 's/^([A-Z_-]+\+?=)'"'"'(.*)'"'"'\s*$/\1\2/g' \
    -e 's/^PDFNAME(\+?=)/OUTPUTNAME\1/g'
}

dc_text=$(cat "$dcfile")

# normalize line ends and =/+= assignments, remove quotes around values,
# normalize PDFNAME to OUTPUTNAME
# (printf '%s' rather than `echo -e`, so backslashes in the file content are
# passed through instead of being expanded as escape sequences here)
normalized=$(printf '%s\n' "$dc_text" | normalize_dc)

# append fake ROOTID
if [[ "$2" ]]; then
minimaldc=$(echo -e "$minimaldc" | sed -r -n '/^ROOTID=/ !p')'\nROOTID='"$2"
normalized+='\nROOTID='"$2"
fi

# Normalize profiling attributes: PROFOS="osuse;sles" is logically the same as
# PROFOS = sles;osuse; , so make those differences disappear
# remove lines for comments, empty lines, lines with attributes we don't care about
relevant_lines=$(echo -e "$normalized" | \
  sed -r -n '/^(ROOTID|MAIN|OUTPUTNAME|PROF[A-Z]+|ADOC_ATTRIBUTES)\+?=/ p' \
  )

# Collapse all assignments of one attribute into a single "ATTR=value" line.
# $1 - attribute name (e.g. MAIN, PROFOS, ADOC_ATTRIBUTES)
# $2 - normalized assignment lines (ATTR=value / ATTR+=value), one per line
# Output: the merged line on stdout
merge_attribute() {
  local attr=$1 lines=$2
  local merged values

  # if the same attribute appears multiple times, choose the last regular
  # (= not +=) occurrence plus the `+=` lines following it; if there is no `=`
  # occurrence, we just get all `+=`, this matches DAPS behavior as of 3.2, cf.
  # https://github.com/openSUSE/daps/issues/650
  #  * grep is anchored on "ATTR=" / "ATTR+=" so that e.g. PROFOS does not
  #    also select PROFOSX lines or lines merely mentioning the name
  #  * the range must be `0,/re/` with -r: in basic regex syntax `+` is a
  #    literal, and a `1,/re/` range can never end on its first line
  #  * `+=` becomes `= `, then every line break plus the repeated "ATTR=" is
  #    dropped, which joins the pieces with a single space
  merged=$(printf '%s\n' "$lines" \
    | grep -E "^${attr}\+?=" \
    | tac \
    | sed -r -n '0,/^[A-Z_-]+=/ p' \
    | tac \
    | sed -r 's/^([A-Z_-]+)\+=/\1= /' \
    | tr '\n' '\r' \
    | sed -r 's/\r([A-Z_-]+)=//g' \
    | tr '\r' '\n')

  # special handling for PROF* and ADOC_ATTRIBUTES
  if [[ "$attr" =~ ^PROF ]]; then
    # resort alphabetically, dedupe, add final `;`
    values=$(printf '%s\n' "$merged" \
      | sed -r 's/^[^=]+=//' \
      | tr ';' '\n' \
      | sort -u \
      | sed -n '/^$/ !p' \
      | tr '\n' ';')
    merged="${attr}=${values}"
  elif [[ "$attr" == "ADOC_ATTRIBUTES" ]]; then
    # resort alphabetically, dedupe (keep last definition), normalize
    # space/= syntax, (hope for the best, because this is a little
    # complicated and it's error-prone to just break at --attribute)
    #  * split directly on a line break instead of a placeholder character
    #    that has to be translated by the byte-oriented `tr` afterwards
    #  * after `tac`, the first entry seen per name is its last definition;
    #    awk compares the complete name, not just a prefix of it
    values=$(printf '%s\n' "$merged" \
      | sed -r -e 's/^[^=]+= *//' -e 's/\s*--attribute(=|\s+)/\n/g' \
      | sed -n '/^$/ !p' \
      | tac \
      | awk -F= '!seen[$1]++' \
      | sort -t= -k1,1 \
      | sed -r 's/^/--attribute /' \
      | paste -s -d ' ' -)
    merged="${attr}=${values}"
  fi

  printf '%s\n' "$merged"
}

minimaldc=''

all_attributes=$(echo -e "$relevant_lines" | grep -oE '^[A-Z_-]+' | sort -u)
while IFS= read -r attr; do
  # the here-string yields one empty line when there are no attributes at all
  [[ -n "$attr" ]] || continue
  # the literal \n is expanded later, when the text is run through `echo -e`
  minimaldc+="\n$(merge_attribute "$attr" "$relevant_lines")"
done <<< "$all_attributes"

# Write the SHA-1 sum of the minimal DC text to stdout. If output_mini is
# non-zero, the sum is prefixed with "# " and followed by the minimal DC text.
# Without the final `tr`, we would needlessly write out a \n / &#10; which
# would end up in our cache files.
if [[ "$output_mini" -ne 0 ]]; then
  echo -n "# "
fi
echo -e "$minimaldc" | sha1sum | cut -f1 -d' ' | tr -d '\n'

# An `if` rather than `[[ ... ]] && echo ...`: when this is the last command
# that runs, a false `[[ ]]` would become the exit status, i.e. the script
# would report failure (1) whenever the minimal DC text is not requested.
if [[ "$output_mini" -ne 0 ]]; then
  echo -e "\n${minimaldc}"
fi

0 comments on commit 7b01d3b

Please sign in to comment.