File enblend-enfuse-html2xhtml of Package enblend-enfuse

#! /bin/sh
#! /bin/dash -x

# This file is part of Enblend.
# Licence details can be found in the file COPYING.

# name:          html2xhtml
# synopsis:      Convert the HTML output of makeinfo(1) to valid XHTML
# author:        Dr. Christoph L. Spiel
# shell version: bash-3.2.39, dash-0.5.4


# Global configuration.
#
# Each value is assigned first and marked readonly in a separate step:
# shells that treat "readonly" as an ordinary builtin may field-split
# an unquoted expansion in "readonly name=$value", which breaks as
# soon as the script path or one of the tool names contains
# whitespace.

# Name of this script, used as prefix of diagnostics.
global__command=$(basename "$0")
readonly global__command

# "NAME,CONTENT" pairs collected from all --meta options, separated
# by semicolons.
global__meta_key_val=

# External tools; each can be overridden through the environment.
global__sed=${SED:-sed}
readonly global__sed
global__tidy=${TIDY:-tidy}
readonly global__tidy
global__xmllint=${XMLLINT:-xmllint}
readonly global__xmllint

# Additional flags for tidy(1) and xmllint(1); the command-line
# options --tidy-flags and --xmllint-flags append to them.
global__tidy_flags=""
global__xmllint_flags="--valid --noblanks --noent -nonet --nsclean --format --encode iso-8859-1"


# Write an error message, prefixed with the name of this script, to
# standard error and terminate the script with exit status 1.
#
# Arguments: the words of the message; they are joined with the first
#            character of IFS (a space by default).
#
# printf(1) is used instead of echo(1) so that backslashes in the
# message, for example inside file names, are printed verbatim by
# every shell.
failwith_error ()
{
    printf '%s: %s\n' "$global__command" "$*" 1>&2
    exit 1
}


# Run tidy(1) with the fixed set of XHTML/MathML options used by this
# script, followed by the user-supplied flags in $global__tidy_flags
# and finally the arguments of this function (the input files).
convert_with_tidy ()
{
    # Only the public identifier of the doctype is passed to tidy(1);
    # the system identifier is filled in afterwards with sed(1) by
    # function postprocess_tidy_output.  A list of recommended
    # doctype declarations is available at
    #     http://www.w3.org/QA/2002/04/valid-dtd-list.html

    # Prepend the fixed options to the caller's arguments.
    # $global__tidy_flags is deliberately left unquoted so that it
    # splits into individual flags.
    set -- \
        --add-xml-decl yes \
        --doctype '"-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN"' \
        --file /dev/null \
        --new-blocklevel-tags "math, mfenced, mfrac, mover, mrow, msqrt, msub, msup, mtable, mtd, mtr, munder, munderover" \
        --new-empty-tags mspace \
        --new-inline-tags "mathinline, mfencedinline, mi, mn, mo, mrowinline, msqrtinline, msubinline, msupinline, mtext" \
        --sort-attributes alpha \
        --output-encoding latin1 \
        --output-xhtml yes \
        --preserve-entities yes \
        --quiet yes \
        --wrap 0 \
        $global__tidy_flags \
        "$@"

    $global__tidy "$@"
}


# Filter: read tidy(1)'s output on standard input and write the
# cleaned-up document to standard output.
#
# We must postprocess tidy(1)'s output, because 1/ it still contains
# elements that make an XML parser puke and 2/ we have intentionally
# introduced pseudo MathML tags to coerce tidy(1) into formatting them
# inline; these tags must be reverted.
#
# Globals:
#   global__meta_key_val (read)   "NAME,CONTENT" pairs separated by ";"
#   lazy__meta_tag (read/written) the <meta> elements, one per line,
#                                 generated from global__meta_key_val
#                                 unless the variable is already set
#   global__sed (read)            sed(1) binary; the scripts below use
#                                 "\n" in replacements and "\|" in
#                                 patterns
postprocess_tidy_output ()
{
    local meta_replacement

    if test -z "$lazy__meta_tag" && test -n "$global__meta_key_val"; then
        # printf with a quoted argument keeps the whitespace of the
        # meta content intact and prevents pathname expansion.
        lazy__meta_tag=$(printf '%s\n' "$global__meta_key_val" | \
                           $global__sed -e 's#;#\n#g' | \
                           $global__sed -e 's#\([^,]*\),\(.*\)#<meta name="\1" content="\2" />#')
    fi

    # Make the meta elements safe for use inside the replacement part
    # of the "s#...#...#" command below: protect backslash, "&" and
    # the delimiter "#", and continue every line but the last with a
    # backslash so that several meta elements do not terminate the
    # sed command prematurely.
    meta_replacement=$(printf '%s\n' "$lazy__meta_tag" | \
                         $global__sed -e 's/[\\&#]/\\&/g' -e '$!s/$/\\/')

    $global__sed \
        -e '1,9s#""#"http://www.w3.org/2002/04/xhtml-math-svg/xhtml-math-svg.dtd"#' \
        \
        -e "s#^<head>#<head>\n$meta_replacement#" \
        \
        -e 's#<html\([^>]*\) lang="[^"]*"\([^>]*\)#<html\1\2#' \
        -e 's#<ol[^>]*>#<ol>#' \
        -e 's#<t\([dh]\)\([^>]*\)width="[^"]*"\([^>]*\)#<t\1\2\3#' \
        -e 's#<ul\([^>]*\)compact="[^"]*"\([^>]*\)#<ul\1\2#' \
        \
        -e 's#<\(/*\)\(math\|mfenced\|mrow\|msqrt\|msub\|msup\)inline#<\1\2#g'
}


# Makeinfo(1) generates indirect-reference files for each float.
# Their filenames start with "Table_" or "Figure_", which is how we
# distinguish them from the html files making up the body.
#
# Arguments: $1 - filename without any directory part
# Returns:   0 if the name starts with one of the prefixes,
#            1 otherwise (including for an empty name)
is_indirect_reference_file ()
{
    # A pattern match on the prefix is used instead of stripping a
    # suffix with "${name%Table_*}": the latter removes only the
    # shortest match and therefore misjudges empty names and names
    # that contain the prefix more than once.
    case "$1" in
        Table_* | Figure_*)
            return 0
            ;;
        *)
            return 1
            ;;
    esac
}


# Convert all html files inside the given directory.
#
# Arguments: $1 - directory that holds the html files
#
# The results are written to a directory in the current working
# directory that is named after $1 with its ".html" suffix replaced
# by ".xhtml".  Two sed(1) scripts are generated inside $1 and left
# there:
#   ,fix-file-refs.sed  rewrites references "X.html" to "X.xhtml" for
#                       every html file X found in $1
#   ,fix-indir.sed      replaces the name of each indirect-reference
#                       file with the file its "url=" entry points to
# Indirect-reference files themselves are not converted.
#
# The script exits with status 1 if a directory cannot be created or
# entered, or if the conversion pipeline of any file fails; the
# partial output of the failed file is removed.  A directory without
# any "*.html" file is a no-op.
convert_directory ()
{
    local html_directory="$1"
    local xhtml_directory
    local html_file html_base xhtml_file url target_file

    xhtml_directory=$(basename "$html_directory" .html).xhtml

    mkdir -p "$xhtml_directory" || \
        failwith_error "cannot create directory \"$xhtml_directory\""

    # The subshell keeps the change of directory local.  An "exit"
    # inside of it only leaves the subshell, so its status is checked
    # to stop the whole script if the directory cannot be entered.
    (
        cd "$html_directory" || failwith_error "cannot cd to \"$html_directory\""
        for html_file in *.html; do
            # Skip the literal pattern if no file matches.
            if test -e "$html_file"; then
                printf '%s\n' "$html_file"
            fi
        done | \
            $global__sed -ne 's#\(.*\)\.html#s|\1\\.html|\1.xhtml|g#p' > ,fix-file-refs.sed
    ) || exit 1

    printf '' > "$html_directory/,fix-indir.sed"
    for html_file in "$html_directory"/*.html; do
        test -f "$html_file" || continue
        html_base=$(basename "$html_file")

        if is_indirect_reference_file "$html_base"; then
            # The target is the text between "url=" and the next
            # double quote, without its "#fragment" part.  Only the
            # first line of the match is used to keep the generated
            # sed command on a single line.
            url=$($global__sed -ne 's#^.*url=\([^"]*\)".*$#\1#p' < "$html_file")
            target_file=$(printf '%s\n' "$url" | $global__sed -e 's/#.*$//' -e 'q')
            printf '%s\n' "s|$html_base|$target_file|" >> "$html_directory/,fix-indir.sed"
        fi
    done

    for html_file in "$html_directory"/*.html; do
        test -f "$html_file" || continue
        html_base=$(basename "$html_file")

        if ! is_indirect_reference_file "$html_base"; then
            xhtml_file=$xhtml_directory/$(basename "$html_base" .html).xhtml

            # The status of the pipeline is the status of xmllint(1),
            # its last command.
            convert_with_tidy "$html_file" | \
                postprocess_tidy_output | \
                $global__sed \
                  -f "$html_directory/,fix-indir.sed" \
                  -f "$html_directory/,fix-file-refs.sed" \
                  -e 's|<a\([^>]*\)name="[^"]*"\([^>]*\)|<a\1\2|g' \
                  -e 's|\("[^/"]*\)\.html\([#"]\)|\1.xhtml\2|' | \
                $global__xmllint $global__xmllint_flags --output "$xhtml_file" - || \
                { rm -f "$xhtml_file"; exit 1; }
        fi
    done
}


# Convert a single standalone html file.
#
# Arguments: $1 - path of the html file
#
# The result is written to the current working directory under the
# base name of $1 with its ".html" suffix replaced by ".xhtml".
# "name" attributes of anchors are turned into "id" attributes.  If
# the conversion pipeline fails, the partial output is removed and
# the script exits with status 1.
convert_file ()
{
    local html_file="$1"
    local xhtml_file

    xhtml_file=$(basename "$html_file" .html).xhtml

    # The status of the pipeline is the status of xmllint(1), its
    # last command.
    convert_with_tidy "$html_file" | \
        postprocess_tidy_output | \
        $global__sed -e 's|<a\([^>]*\)name=\([^>]*\)|<a\1id=\2|g' | \
        $global__xmllint $global__xmllint_flags --output "$xhtml_file" - || \
        { rm -f "$xhtml_file"; exit 1; }
}


# Convert one command-line argument to XHTML.
#
# Arguments: $1 - an html file or a directory of html files
#
# Directories are handed to convert_directory, everything else to
# convert_file.  The argument is passed on quoted so that names
# containing whitespace or glob characters arrive unchanged.
convert_html_to_xhtml ()
{
    local html="$1"

    if test -d "$html"; then
        convert_directory "$html"
    else
        convert_file "$html"
    fi
}


# We prefer to check for all binaries before we get an unpleasant
# surprise in one of the pipes.
#
# Every tool must be defined and must run successfully when called
# with "--version"; otherwise the script is terminated through
# failwith_error.  The loop variables are local so that they cannot
# clobber variables of the caller.
check_binaries ()
{
    local name binary

    for name in sed tidy xmllint; do
        case $name in
            sed)     binary=$global__sed ;;
            tidy)    binary=$global__tidy ;;
            xmllint) binary=$global__xmllint ;;
        esac

        test -n "$binary" || \
            failwith_error "variable \"$name\" is not defined"
        # $binary is deliberately unquoted, as in every other
        # invocation of the tools in this script.
        $binary --version > /dev/null 2>&1 || \
            failwith_error "cannot execute \"$binary\" for \"$name\""
    done
}


# Parse the leading options of the command line.
#
# Arguments: the complete command line of the script
# Globals:   global__meta_key_val, global__tidy_flags and
#            global__xmllint_flags are extended by the respective
#            options
# Returns:   the number of arguments consumed as options, including a
#            terminating "--"; the caller shifts them away
#
# Parsing stops at "--" or at the first argument that does not start
# with a dash.  Unknown options terminate the script through
# failwith_error; "-h" and "--help" print the help screen and exit
# with status 0.
parse_options ()
{
    local n=0

    while test $# -gt 0; do
        case "$1" in
            --meta=*)
                if test -z "$global__meta_key_val"; then
                    global__meta_key_val="${1#--meta=}"
                else
                    global__meta_key_val="$global__meta_key_val;${1#--meta=}"
                fi
                ;;
            -h | --help)
                show_help
                exit 0
                ;;
            --tidy-flags=*)
                global__tidy_flags="$global__tidy_flags ${1#--tidy-flags=}"
                ;;
            --xmllint-flags=*)
                global__xmllint_flags="$global__xmllint_flags ${1#--xmllint-flags=}"
                ;;

            --)
                # The separator itself counts as consumed.
                n=$((n + 1))
                break
                ;;

            --*)
                failwith_error "unknown long option \"$1\""
                ;;
            -*)
                failwith_error "unknown short option \"$1\""
                ;;

            *)
                break
                ;;
        esac
        shift
        n=$((n + 1))
    done

    # The count travels in the exit status, which holds only values
    # up to 255; a larger count would wrap around and make the caller
    # shift the wrong number of arguments.
    test "$n" -le 255 || failwith_error "too many options"

    return "$n"
}


# Print the help screen on standard output.  The usage line shows the
# name of this script as stored in $global__command, and the section
# on environment variables shows the tool names currently in effect.
show_help ()
{
    printf '%s' "\
Usage: $global__command [OPTIONS] [HTML...]
Convert HTML files or directories to XHTML.

Options:
      --meta=NAME,CONTENT    add meta tags with NAME and CONTENT
      --tidy-flags=FLAGS     set flags for tidy(1)
      --xmllint-flags=FLAGS  set flags for xmllint(1)
  -h, --help                 show this help screen

Influential environment variables:
  SED                        name of sed binary [default: \"$global__sed\"]
  TIDY                       name of tidy binary [default: \"$global__tidy\"]
  XMLLINT                    name of xmllint binary [default: \"$global__xmllint\"]
"
}



# Program logic: parse the options, verify that all external tools
# can be run, and convert every remaining argument in turn.
#
# Arguments: the command line of the script
main ()
{
    local option_count

    # parse_options reports the number of consumed arguments in its
    # exit status.
    parse_options "$@"
    option_count=$?
    shift "$option_count"

    check_binaries

    while test $# -gt 0; do
        convert_html_to_xhtml "$1"
        shift
    done
}


# Script entry point: hand the complete command line to main.
main "$@"
# Reached only when main has returned; exit with status 0.
exit 0
openSUSE Build Service is sponsored by