#!/bin/sh

# Implementation of HTTP exchange translation described here:
#  http://www.w3.org/2001/tag/awwsw/http.owl

# Argument is a URI.  Output is an RDF graph (in Turtle).

# Test this on:
#  http://purl.org/NET/jar
#  http://purl.org/dc/terms/title

# TBD: foo is a description resource for foo#bar
# TBD: Amazon.com allows GET but not HEAD (HEAD leads to a 405)

set -e

# err MESSAGE...
#
# Writes the arguments, joined by spaces and prefixed with "#!! ", to stderr
# and then exits the current shell with status 1.  When called inside a
# subshell only that subshell exits; `set -e` above then stops the script.
#
# printf '%s' is used instead of echo so that backslashes in the message
# (e.g. from a user-supplied URI) are printed literally in every shell.
err() {
  printf '#!! %s\n' "$*" >&2
  exit 1
}

# Command line handling.
#   one argument  : the URI to look up; it is also the "original" URI and
#                   the @prefix boilerplate is wanted.
#   two arguments : the URI to look up followed by the URI recorded as
#                   "original"; the boilerplate is suppressed.
# Any other argument count is a usage error.
case $# in
  1)
    original=$1
    boilerplate=yes
    ;;
  2)
    original=$2
    boilerplate=no
    ;;
  *)
    err "Usage: $0 uri"
    ;;
esac

# Request-URI
uri=$1

# Verbosity of the "# ..." trace lines: 0 = none, 1 = some, 2 = every header.
DEBUG=0

# doit
#
# Sends a HEAD request for the global $uri with curl and writes Turtle
# statements describing the outcome to stdout:
#   200          a ht:Correspondence for <$original>, filled in from the
#                Content-Language, Content-Length, Date, Last-Modified and
#                Expires headers (ETag is emitted as a comment only)
#   301/302/307  ht:residesWith (plus owl:sameAs for 301), after which the
#                script re-executes itself on the Location target
#   303          wdrs:describedby pointing at the Location target
#   otherwise    a "# ... failed" comment and exit status 1
# A 3xx response without a Location header produces no output.
#
# Globals read: uri, original, DEBUG.  The positional arguments are not
# consulted.
# Exits through err when $uri contains a fragment identifier or when no
# status line could be read from curl.
doit() {

  # Don't deal with fragment ids
  case $uri in
    *'#'*)
      err "This case isn't supported: fragment of ${uri%%#*}"
      ;;
  esac

  # The consumer side of the pipe is a subshell, so the header variables
  # collected below exist only inside it; all output is produced there.
  curl --silent --show-error --head "$uri" | (
    # Only the middle field of the status line (the numeric code) is used.
    # Under `set -e` a failed read would end this subshell without a word,
    # so the empty-response case is reported explicitly.
    read -r noise1 status noise2 ||
      err "No HTTP response received for ${uri}"
    # With no reason phrase the carriage return stays glued to the code.
    status=$(printf '%s' "$status" | tr -d '\r')
    if [ "$DEBUG" -gt 0 ]; then echo "# status is =$status="; fi

    while read -r line; do
      name=$(printf '%s\n' "$line" | sed -e "s=\([a-zA-Z-]*\): *.*=\1=")
      # Strip off trailing whitespace, especially the carriage return
      value=$(printf '%s\n' "$line" | sed -e "s=[a-zA-Z-]*: *\(.*\)=\1=" \
                                    | sed -e "s=[[:space:]]*$==")
      # Canonicalize case
      name=$(printf '%s\n' "$name" | foldcase)

      if [ -n "$name" ]; then
        if [ "$DEBUG" -gt 1 ]; then echo "# name=$name value=$value"; fi
        case $name in
          location)         location=$value ;;
          content-type)     content_type=$value ;;
          content-length)   content_length=$value ;;
          content-location) content_location=$value ;;
          content-language) content_language=$value ;;
          expires)          expires=$value ;;
          last-modified)    last_modified=$value ;;
          etag)             etag=$value ;;
          date)             date=$value ;;
        esac
      fi
    done

    if [ -n "$date" ] && [ "$DEBUG" -gt 0 ]; then
      echo "# got date: $date"
    fi

    case $status in
      200)
        # HEAD+200: note the correspondence
        if [ "$DEBUG" -gt 0 ]; then echo "# 200 ..."; fi

        boilerplate
        echo "# 200: correspondence between an entity and the resource"
        echo "<$original> a ht:Get200Candidate."
        echo "[a ht:Correspondence;"
        echo " ht:ofWaRepresentation"

        # Deal with the entity

        echo "  [a ht:Entity;"

        if [ -n "$content_language" ]; then
          # TBD
          echo "   ht:inLanguage language:${content_language};"
        fi
        # Switched off on purpose by the trailing `false`:
        # this has slashes in it. what to do.
        # need to strip off trailing ; charset=
        if [ -n "$content_type" ] && false; then
          echo "   ht:hasContentType contentType:${content_type};"
        fi

        if [ -n "$content_length" ]; then
          echo "   ht:hasContentLength ${content_length};"
        fi
        if [ -n "$etag" ]; then
          echo "   # Server-provided etag = ${etag}"
        fi
        echo "   ];"

        echo " ht:toResource <$original>;"

        # Need to convert all dates to ISO
        if [ -n "$date" ]; then
          idate=$(isodate "$date")
          echo " ht:heldAt '${idate}'^^xsd:dateTime;"
        fi
        if [ -n "$last_modified" ]; then
          idate=$(isodate "$last_modified")
          echo " ht:heldAt '${idate}'^^xsd:dateTime;"
        fi

        if [ -n "$expires" ]; then
          # What does a value of -1 mean here?
          # It means 'already expired'.  See HTTP section 14.21.
          idate=$(isodate "$expires")
          echo " ht:holdsUntil '${idate}'^^xsd:dateTime;"
        fi

        echo " ]."
        ;;

      301 | 302 | 307)
        if [ -n "$location" ]; then
          boilerplate
          echo "# $status: redirect"
          echo "<$uri> ht:residesWith <$location>."

          if [ "$status" = 301 ]; then
            echo "<$uri> owl:sameAs <$location>."
          fi

          if [ "$DEBUG" -gt 0 ]; then
            echo "# redirecting to $location using $0 because of $status"
          fi
          # Start over on the redirect target.  Passing a second argument
          # makes the new run skip the boilerplate and record the current
          # $uri as its "original".  "$0" is quoted so a script path that
          # contains spaces still works.
          exec "$0" "$location" "$uri"
        fi
        ;;

      303)
        if [ -n "$location" ]; then
          boilerplate
          echo "# $status: description resource"
          echo "<$uri> wdrs:describedby <$location>."
        fi
        ;;

      *)
        # Otherwise, most likely a 404.

        echo "# $uri failed - status $status"
        exit 1
        ;;
    esac
  )

}

# boilerplate
#
# Writes the preamble of a Turtle document to stdout.  When the global
# variable $boilerplate is "yes" this is a comment naming $uri followed by
# the @prefix declarations used by the generated statements; in every case
# one empty line is written last.
#
# The variable is quoted (and defaulted) so that an unset or empty value is
# simply treated as "no" instead of raising a `[` syntax error.
boilerplate() {
  if [ "${boilerplate:-}" = yes ]; then
    printf '%s\n' \
      "# RDF capturing outcome of GET $uri" \
      "" \
      "@prefix xsd: <http://www.w3.org/2001/XMLSchema#>." \
      "@prefix owl: <http://www.w3.org/2002/07/owl#>." \
      "@prefix wdrs: <http://www.w3.org/2007/05/powder-s#>." \
      "@prefix ht: <http://www.w3.org/2001/tag/awwsw/http.owl#>."
  fi
  echo ""
}

# foldcase
#
# Filter: copies stdin to stdout with the 26 ASCII capital letters replaced
# by their lower-case counterparts.  All other characters pass through.
foldcase() {
  local capitals=ABCDEFGHIJKLMNOPQRSTUVWXYZ
  local smalls=abcdefghijklmnopqrstuvwxyz
  tr "$capitals" "$smalls"
}

# isodate HTTP_DATE
#
# Converts a date in the layout "Tue, 23 Jun 2009 00:54:20 GMT" to the
# xsd:dateTime form "2009-06-23T00:54:20Z" and prints it on stdout.
#   GMT, UTC -> "Z"
#   EDT      -> "-04:00"
# A value that is not in that layout (for example "-1" or "0"), or that
# names any other zone, is printed back unchanged so the caller still sees
# what the server sent.
#
# The zone is either "Z" or a numeric offset, never both; xsd:dateTime does
# not allow "Z-04:00".
isodate() {
  local stamp="$1"
  local wkday day month year hms tz extra
  local ok=yes

  # Let the shell split the fields.  A here-document replaces the fixed,
  # world-shared /tmp file, which two concurrent runs could overwrite.
  read -r wkday day month year hms tz extra <<EOF
$stamp
EOF

  case $wkday in
    ???,) ;;
    *) ok=no ;;
  esac
  case $day in
    [0-9][0-9]) ;;
    *) ok=no ;;
  esac
  case $year in
    [0-9][0-9][0-9][0-9]) ;;
    *) ok=no ;;
  esac
  case $hms in
    [0-9][0-9]:[0-9][0-9]:[0-9][0-9]) ;;
    *) ok=no ;;
  esac
  [ -z "$extra" ] || ok=no

  case $month in
    Jan) month=01 ;;
    Feb) month=02 ;;
    Mar) month=03 ;;
    Apr) month=04 ;;
    May) month=05 ;;
    Jun) month=06 ;;
    Jul) month=07 ;;
    Aug) month=08 ;;
    Sep) month=09 ;;
    Oct) month=10 ;;
    Nov) month=11 ;;
    Dec) month=12 ;;
    *) ok=no ;;
  esac

  case $tz in
    GMT | UTC) tz=Z ;;
    EDT) tz=-04:00 ;;
    *) ok=no ;;
  esac

  if [ "$ok" = yes ]; then
    printf '%s\n' "${year}-${month}-${day}T${hms}${tz}"
  else
    printf '%s\n' "$stamp"
  fi
}

# Entry point.  doit works from the global variables uri and original set
# during argument handling; it does not reference the positional arguments
# handed to it here.
doit "$uri" "$original"
