#!/bin/bash

# Philippe Teuwen <phil_a_teuwen_point_org>
# @doegox

# Run from the directory holding this script so that the relative
# link-grammar paths below resolve.
REALPATH=$0
SCRIPT_BASENAME="${REALPATH##*/}"
# The suffix is quoted so that a script name containing glob characters
# is stripped literally instead of being used as a pattern.
REALDIR="${REALPATH%"$SCRIPT_BASENAME"}"
if [ -n "$REALDIR" ]; then
    # Without the right working directory every later lookup would fail,
    # so stop here instead of continuing.
    cd -- "$REALDIR" || {
        echo "$SCRIPT_BASENAME: cannot cd to '$REALDIR'" 1>&2
        exit 1
    }
fi

# Directory holding the link-grammar dictionaries
DICT_PATH="link-grammar"
# Parser command line (environment assignment + binary); the stages that
# use it expand it through eval.
LINKGRAMMAR="LD_PRELOAD=link-grammar/liblink-grammar.so link-grammar/link-grammar"
# Ciphering secret: first argument, 0 when absent or empty
secret=${1:-0}
# Text to encode, read in full from stdin
input=$(cat)
# Set to true to trace each stage on stderr
DEBUG=false

# When true, keep the max count of each word list (the "/t" part) in the
# encoded stream; when false it is left out, as it can be reconstructed
# at decode time.
KEEP_MAX=false

# Grammar parsing:
##################

# Flatten the input to one space-separated line.  The words are split
# with read rather than an unquoted expansion so that tokens such as "?"
# or "*" are not glob-expanded against the current directory, and printf
# is used so that backslashes in the text are passed through untouched.
read -r -d '' -a input_words <<<"$input"

# Feed the sentence to the parser (with a very wide display so that the
# diagram is not wrapped), then pick the line that follows the first
# line starting with three "|" link markers and drop its xxx-WALL token.
sentence=$(printf '!width=5000\n%s\n' "${input_words[*]}" |
    eval "$LINKGRAMMAR en" 2>/dev/null |
    awk '/^[[:space:]]*\|[[:space:]]*\|[[:space:]]*\|/ {
	    getline;
	    sub(/ ?[A-Z]+-WALL ?/,"",$0);
	    print;
	    exit;
	}')
$DEBUG && echo -n "Parsing:   " 1>&2
$DEBUG && printf '%s\n' "$sentence" 1>&2

# Encoding:
##################

$DEBUG && echo -n "Encoding1: " 1>&2

# dict_word_position FILE WORD
# Print the 1-based position of WORD in FILE once FILE has been split
# into one entry per line (trailing space dropped, every space turned
# into a newline).  Prints nothing when WORD is not in FILE.
# WORD is matched as a fixed string: entries such as "dog.n" contain
# regex metacharacters and must not match neighbours like "dogxn".
# Only the first occurrence is reported so the result is a single number.
dict_word_position() {
    sed 's/ $//;s/ /\n/g;' < "$1" |
        grep -F -x -n -m 1 -e "$2" |
        sed 's/:.*$//'
}

sentencex=""
# Split on whitespace without pathname expansion (a lone "?" token must
# stay a "?").
read -r -d '' -a parsed_words <<<"$sentence"
for word in "${parsed_words[@]}"; do
    # Unless a dictionary position is found the word is kept as is
    wordx=$word
    # Try to find the word in one of the dict files: the parser prints
    # the file name between angle brackets; keep the first one only.
    file=$(printf '%s\n' "!!$word" |
        eval "$LINKGRAMMAR en" 2>/dev/null |
        grep -m 1 "<" |
        sed 's/.*<\(.*\)>.*/\1/')
    if [ -n "$file" ]; then
        # See which occurrence is our word in that file, that's our "encoding"
        n=$(dict_word_position "$DICT_PATH/$file" "$word")
        # The word may be missing from the reported file; encoding it
        # then would give an index-less "file:/t" token.
        if [ -n "$n" ]; then
            # We encode also t but it's just to make ciphering easier
            # as t is just the size of the wordlist
            t=$(wc -w < "$DICT_PATH/$file")
            file=${file#en/words/words.}
            wordx="$file:$n/$t"
        fi
    fi
    [ "$sentencex" != "" ] && sentencex="$sentencex "
    sentencex="$sentencex$wordx"
    $DEBUG && echo -n "$wordx " 1>&2
done
$DEBUG && echo 1>&2

# Prepare common words dict for phases 2&3
###########################

# Cache of the multi-word lists of 4.0.dict, one list per line with the
# part after ":" removed.
TMPDICT=/tmp/4.0.dict.short
# we eliminate the list "a<>an" and "such a<>such an"
# as the grammar cannot make the diff
# At decoding we'll try to do a smart guess
#
# The cache is rebuilt when missing or empty.  It is written to a
# private temporary file and renamed into place only when the build
# produced something, so an interrupted or failed build can never leave
# a truncated cache behind for later runs.
if [ ! -s "$TMPDICT" ]; then
    if tmpdict_new=$(mktemp "$TMPDICT.XXXXXX"); then
        if tr '\n;' ' \n' < "$DICT_PATH/en/4.0.dict" |
            sed 's/^ \+//;s/  / /g' |
            grep '^[a-z].* [a-z].*:' |
            sed 's/:.*$//' |
            grep -v "^\(a an\|such_a such_an\)$" > "$tmpdict_new" &&
            [ -s "$tmpdict_new" ]; then
            # mktemp creates the file with mode 0600; give it the mode a
            # plain redirection would have produced under the current umask.
            chmod "$(printf '%03o' $(( 0666 & ~$(umask) )))" "$tmpdict_new"
            mv -f -- "$tmpdict_new" "$TMPDICT"
        else
            rm -f -- "$tmpdict_new"
            echo "warning: could not build $TMPDICT from $DICT_PATH/en/4.0.dict" 1>&2
        fi
    else
        echo "warning: could not create a temporary file for $TMPDICT" 1>&2
    fi
fi

# Encoding, phase 2
###################

$DEBUG && echo -n "Encoding2: " 1>&2

# join_compounds DICTFILE SENTENCE
# Restore words with spaces: for every compound entry of DICTFILE
# written with underscores (e.g. "as_of"), replace its space-separated
# spelling in SENTENCE by the underscore spelling.  Prints the resulting
# sentence on one line with single spaces between tokens.
# The substitutions are applied in dictionary order by a single sed run.
# SENTENCE is split with read, not an unquoted expansion, so that tokens
# such as "?" are not glob-expanded and a leading "-n" is not taken as
# an echo option.
# NOTE(review): as before, a compound is replaced wherever its spaced
# spelling occurs, even inside longer words ("has often" contains
# "as of") -- confirm against the decoder before tightening this.
join_compounds() {
    local dict=$1 script
    local -a tokens
    read -r -d '' -a tokens <<<"$2"
    # One "s/a b/a_b/g" command per compound found in the dictionary
    script=$(grep -E -o "[a-z]+(_[a-z]+)+" "$dict" |
        awk '{ spaced = $0; gsub(/_/, " ", spaced); print "s/" spaced "/" $0 "/g" }')
    printf '%s\n' "${tokens[*]}" | sed -f <(printf '%s\n' "$script")
}

sentencex=$(join_compounds "$TMPDICT" "$sentencex")

$DEBUG && printf '%s\n' "$sentencex" 1>&2

# Encoding, phase 3
###################

$DEBUG && echo -n "Encoding3: " 1>&2

# lookup_common_word WORD
# Look WORD up in the word lists of $TMPDICT (one list per line) and
# print "@LINE:POS/COUNT" for the first list holding it, where LINE is
# the line number of the list, POS the position of WORD in it and COUNT
# the number of words of the list.  Prints nothing when no list has it.
# Whole space-separated entries are compared as plain strings, so that
# "he" does not match "she", "vis" does not match "vis-a-vis" and "."
# is not taken as a wildcard.  The word goes through the environment to
# keep awk from interpreting backslashes in it.
lookup_common_word() {
    LOOKUP_WORD=$1 awk '
        {
            for (i = 1; i <= NF; i++)
                if (($i "") == (ENVIRON["LOOKUP_WORD"] "")) {
                    printf "@%d:%d/%d\n", NR, i, NF
                    exit
                }
        }' "$TMPDICT"
}

sentencey=""
# Split on whitespace without pathname expansion
read -r -d '' -a phase2_words <<<"$sentencex"
for wordx in "${phase2_words[@]}"; do
    if [[ "$wordx" == *:* ]]; then
        # This is an already encoded word
        wordy=$wordx
    else
        # Look in the tmpdict for a second chance
        wordy=$(lookup_common_word "$wordx")
        if [ -z "$wordy" ]; then
            # Replace all "an" by "a" as the grammar cannot make the diff
            # At decoding we'll try to do a smart guess
            [ "$wordx" == "an" ] && wordx="a"
            # Clean word extension: drop everything from the first "."
            wordy=${wordx/.*}
        fi
    fi
    [ "$sentencey" != "" ] && sentencey="$sentencey "
    sentencey="$sentencey$wordy"
    $DEBUG && echo -n "$wordy " 1>&2
done
$DEBUG && echo 1>&2

# Ciphering:
##################

$DEBUG && echo -n "Ciphering: " 1>&2

# cipher_words OFFSET SENTENCE
# Shift the index of every encoded word of SENTENCE and print the
# resulting sentence on one line.
#   OFFSET    integer shift (the diffused secret)
#   SENTENCE  whitespace-separated tokens; a token "NAME:N/T" with
#             numeric N and T (T > 0) is an encoded word, anything else
#             is copied through unchanged
# Globals: KEEP_MAX (read) - when true the "/T" part is kept in the output
#          DEBUG (read)    - when true each output token is traced on stderr
# Returns: 1, with a message on stderr and nothing on stdout, when
#          OFFSET is not an integer.
# Very stupid ciphering: the offset is applied as a common shift to all
# the words and, when it is not 0, each new index is added to the shift
# of the next word.
# Tokens are parsed with a regex and never passed to eval: they derive
# from the text read on stdin and must not be able to run commands.
cipher_words() {
    local offset=$1 prev=0 token name n t nx out=""
    local offset_re='^(-?)0*([0-9]{1,18})$'
    local encoded_re='^(.*):([0-9]{1,9})/([0-9]{1,9})$'
    local -a tokens

    if ! [[ "$offset" =~ $offset_re ]]; then
        echo "cipher: invalid secret '$offset'" 1>&2
        return 1
    fi
    # Leading zeros are dropped so the value is never read as octal
    offset="${BASH_REMATCH[1]}${BASH_REMATCH[2]}"

    # Split on whitespace without pathname expansion
    read -r -d '' -a tokens <<<"$2"
    for token in "${tokens[@]}"; do
        if [[ "$token" =~ $encoded_re ]] && (( 10#${BASH_REMATCH[3]} > 0 )); then
            # This is an encoded word
            name=${BASH_REMATCH[1]}
            n=$(( 10#${BASH_REMATCH[2]} ))
            t=$(( 10#${BASH_REMATCH[3]} ))
            nx=$(( (n - 1 + offset + prev) % t + 1 ))
            (( offset != 0 )) && prev=$nx
            token="$name:$nx"
            $KEEP_MAX && token="$token/$t"
        fi
        out="${out:+$out }$token"
        $DEBUG && echo -n "$token " 1>&2
    done
    $DEBUG && echo 1>&2
    printf '%s\n' "$out"
}

# lossy way to diffuse the secret
secret=$(echo "$secret * 1234567 % 32768"|bc)

# Nothing is printed when the secret could not be turned into an integer
# (e.g. bc failed); the failure status is then the status of this
# command.
sentencez=$(cipher_words "$secret" "$sentencey") && printf '%s\n' "$sentencez"

