#!/bin/bash -f
#FINDBIB 2.3, Aug, 2007  K. Goldstein
#
################ FINDBIB 2.0, F. Nesti, August, 26, 1997 ###############
# 
# This script builds a bibliography database from the latex source file.
#
# Usage: findbib foo.tex
#
# Every \cite{...} of a reference in SPIRES-Bibtex standard form 
# (Author:YEARaa)  or in the form of an arXiv reference 
# (arXive-name/yymmnnn OR yymmnnn OR yymm.nnnn)
# is searched in WWW-SPIRES.SLAC.STANFORD.EDU and all the bibtex records
# are returned in a .BIB file.
#
# The value of the variable ARXIV sets the default arxiv to search if the
# cite is of the form \cite{yymmxxx}
#
# To change the default arxiv change the value below:
#
ARXIV='hep-th/'
#ARXIV='hep-ph/'
#
# The .BIB file will have the same name as the .TEX file
#
# The script uses awk, sed, lynx, bibtex and LaTeX. You will have to install
# these for it to work.
#
####################################################################
#
# How the script works:
#
# Steps: 
# Generate an AUX file with LaTeX if not present,
# searches the AUX file for labels, 
# searches SPIRES for records,
# writes them in $1.bib, renaming last as $1.bib.old,
# calls BibTeX on the file (!).
#
# 
#
# 
#######################################################################
# 
#  FINDBIB 2.1, May, 2007  K. Goldstein
# 
# Changes:
#
# 1. Script updated to fix some changes on Spires website
#
# 2. Uses sort to remove duplicate entries
#
# 3. Can search for authors with double-barrelled surnames
#
# 4. Every cite of the form \cite{yymmxxx} returns a hep-th reference.
# To change the default arXiv, change the value of ARXIV
#
# 5. every cite of the form \cite{yymm.xxxx} returns a new arxiv reference
#
# 6. Gratuitous comments added so that the script is easy to fix
# when spires changes their format
#
#######################################################################
# FINDBIB 2.2 : fixed some bugs
#######################################################################
#######################################################################
# FINDBIB 2.3 : added some flags
#######################################################################
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation (version 2) 
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#
########################################

# The main script starts around line 351  
# Useful functions defined:

# Print usage information
usage(){
  # Write the command-line help text to stdout.
  # The banner line interpolates the global VERSION.
  printf '%s\n' \
    "findbib $VERSION" \
    "---" \
    "Usage: findbib [-ah]  texfile.tex" \
    "" \
    "Options:" \
    "" \
    "     -a : Only add new citations" \
    "     -h : Print this message"
}

#a counter
cpp(){
  # Bump the global counter `c` by one; an unset or empty `c` counts as 0.
  : "$((c += 1))"
}

# extract the labels from the .aux file
get_labels() {
###################
# Dispatch to the label extractor selected by the "add only" flag.
# $1 - the .bib file
# $2 - the .aux file
# $3 - flag: 0 hands off to get_all_labels (start from scratch),
#      any other value hands off to get_new_labels (only labels that
#      are not yet in the .bib file)
###################
  # The file names are quoted so that paths containing spaces reach the
  # helpers as single arguments.
  if [[ $3 = 0 ]]; then
    get_all_labels "$1" "$2"
  else
    get_new_labels "$1" "$2"
  fi
}

# function to process the labels
parse_aux() {
  # Extract every citation label from an .aux file.
  # $1 - the .aux file to scan for \citation{...} commands
  # $2 - file that receives the sorted, de-duplicated labels (one per line)
  # The labels are also echoed to stdout, after a banner line.
  echo "---------- EXTRACTING LABELS FROM $1: -------------"

  # Stage 1: split each \citation line at "}" so that several commands on
  #          one line end up on separate lines.
  # Stage 2: keep only the text that follows "\citation{".
  # Stage 3: split comma-separated label lists, skipping empty pieces
  #          (e.g. the one produced by a trailing comma).
  awk -F'}' '/\\citation\{/ { for (i = 1; i <= NF; i++) print $i }' "$1" \
    | awk 'match($0, /\\citation\{/) { print substr($0, RSTART + RLENGTH) }' \
    | awk -F',' '{ for (i = 1; i <= NF; i++) if ($i != "") print $i }' \
    | sort -u \
    | tee "$2"
}


# extract  all the labels from the .aux file
get_all_labels() {
####################
# Start a fresh label list: move any existing .bib file out of the way
# and have parse_aux write every label of the .aux file to bib.labels.
# $1 = the .bib file (renamed to $1.old when it exists)
# $2 = the .aux file
# Sets the global `out` to the name of the label file.
#########################################
  # Backup old bibtex file; quoting keeps paths with spaces intact
  if [[ -f $1 ]]; then
    mv -f -- "$1" "$1.old"
  fi
  out=bib.labels
  parse_aux "$2" "$out"
}

# extract new labels from the .aux file
get_new_labels() {
######################
# Write to bib.labels only those labels of the .aux file that have no
# entry in the .bib file yet.
# $1 the .bib file
# $2 the .aux file
# Sets the global nothing_new: 1 when no new label was found (bib.labels
# is then absent), 0 otherwise. Uses bib.labels_a / bib.labels_b as
# scratch files and removes them before returning.
#########################################
  local label_a

  # remove old files
  rm -f bib.labels

  out=bib.labels_a
  parse_aux "$2" "$out"

  echo "---------- EXTRACTING RECORDS FROM $1: -------------"

  # Collect the keys of the existing entries. Only lines that open an entry
  # ("@Type{key,") are considered, so field lines that merely contain the
  # word "article" (e.g. a title with "Particle") no longer yield bogus keys.
  # Every entry type counts, because a key may appear only once per .bib file.
  if [[ -f $1 ]]; then
    awk -F'{' '/^[ \t]*@[A-Za-z]+[ \t]*\{/ { print $2 }' "$1" \
      | sed -e 's/,.*$//' -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' \
      | sort -u \
      | tee bib.labels_b
  else
    # No .bib file yet: every label is new.
    : > bib.labels_b
  fi

  echo "---------- FINDING NEW LABELS: -------------"

  while read -r label_a; do
    # An empty pattern would make grep treat the file name as the pattern.
    [[ -n $label_a ]] || continue
    # Whole-line, fixed-string comparison: "0101001" must not be taken as
    # present just because "hep-th/0101001" is, and "." must stay literal.
    if ! grep -qxF -- "$label_a" bib.labels_b; then
      echo "$label_a" | tee -a bib.labels
    fi
  done < bib.labels_a

  #clean up
  rm -f bib.labels_a bib.labels_b

  nothing_new=0
  if [[ ! -f bib.labels ]]; then
    nothing_new=1
  fi
}

# add a comment to the bibtex file
print_header() {
######################
# Write a three-line comment banner to the bibtex file.
# $1 - the .bib file
# $2 - flag to see whether we should just get new labels:
#      0 truncates the file and writes a "GENERATED BY" banner,
#      anything else appends a "REFERENCES ... ADDED BY" banner
# Reads the globals TEXF and VERSION.
######################
  local stamp
  stamp=$(date +"%x, AT %X.")

  # The redirect target is quoted: an unquoted name containing spaces makes
  # bash abort the redirect as ambiguous.
  if [[ $2 = 0 ]]; then
    {
      echo "%%"
      echo "%% BIBTEX FILE FOR $TEXF.tex GENERATED BY FINDBIB $VERSION ON $stamp"
      echo "%%"
    } > "$1"
  else
    {
      echo "%%"
      echo "%% REFERENCES FOR $TEXF.tex ADDED BY FINDBIB $VERSION ON $stamp"
      echo "%%"
    } >> "$1"
  fi
}


parse_label() {
# convert the label into a search term
#$1 - a label extracted from the .aux file
#
# On a recognised label the global SEARCH is set to the search term and a
# short summary is printed. An unrecognised label is reported as a
# "PERSONAL LABEL" and the global flag is set to 1; SEARCH is left as is.
# The Author:yyyyxx branch also sets the globals LABEL, AUTHOR, DATE, KEY
# and AUTHOR_SPACE. The bare yymmnnn branch prefixes the global ARXIV.
  local rest

  ###########################################
  # is the record of the form Author:yyyyxx ?
  ###########################################
  if [[ $1 = *:[0-9][0-9][0-9][0-9][a-z][a-z]* ]]; then
    # Same split as before, without forking sed: the first ':' becomes '/'.
    LABEL="${1%%:*}/${1#*:}"
    AUTHOR=${LABEL%/*}
    rest=${LABEL#*/}
    # year = $rest without its trailing run of lowercase letters
    DATE=${rest%"${rest##*[!a-z]}"}
    # key suffix = whatever follows the four year digits
    KEY=${rest:4}
    # add a space in two word names (lowercase letter followed by uppercase):
    AUTHOR_SPACE=$(printf '%s\n' "$AUTHOR" | sed -e 's/\([a-z]\)\([A-Z]\)/\1 \2/g')
    SEARCH=A+$AUTHOR_SPACE+AND+DATE+$DATE
    echo "$AUTHOR_SPACE $DATE $KEY"

  ###########################################
  # is the record of the form *-*/yymmxxx ?
  ###########################################
  elif [[ $1 = *-*/[0-9][0-9][0-9][0-9][0-9][0-9][0-9] ]]; then
    SEARCH="EPRINT $1"
    echo "$1"

  ###########################################
  # is the record of the form yymmxxx ?
  ###########################################
  elif [[ $1 = [0-9][0-9][0-9][0-9][0-9][0-9][0-9] ]]; then
    SEARCH="EPRINT $ARXIV$1"
    echo "$1"

  ###########################################
  # is the record of the new form yymm.xxxx or yymm.xxxxx ?
  # (the five-digit sequence number is what arXiv issues since 2015)
  ###########################################
  elif [[ $1 = [0-9][0-9][0-9][0-9].[0-9][0-9][0-9][0-9] ||
          $1 = [0-9][0-9][0-9][0-9].[0-9][0-9][0-9][0-9][0-9] ]]; then
    SEARCH="EPRINT $1"
    echo "$1"

  ###########################################
  # is the record some other format ?
  ###########################################
  else
    echo "$1: PERSONAL LABEL"
    flag=1
  fi
}

get_url() {
#######################
# randomly pick a mirror
# $1 - name of the variable that receives the search URL prefix
#      ("<mirror>/spires/find/hep/www?"); it must not be called
#      mirror_count or mirror_index, which are locals of this function
# Also sets the globals MIRRORS and BASE and prints the chosen mirror.
######################
  local mirror_count mirror_index

  MIRRORS=("http://www-library.desy.de" "http://www-spires.fnal.gov" "http://usparc.ihep.su" "http://www-spires.dur.ac.uk" "http://www.yukawa.kyoto-u.ac.jp"  "http://www.slac.stanford.edu")
  mirror_count=${#MIRRORS[@]}
  mirror_index=$((RANDOM % mirror_count))
  BASE=${MIRRORS[mirror_index]}
  echo "Querying $BASE"
  #####################
  # Assign through printf -v rather than eval: the value is never re-parsed
  # by the shell, and the trailing "?" cannot be taken for a glob character
  # when the script runs without the noglob (-f) flag of its shebang line.
  printf -v "$1" '%s' "$BASE/spires/find/hep/www?"
}


download_search (){
#####################
# Download a webpage with the search term $1
# starting at result $2
# The page is written to the file "out" in the current directory.
# Reads the global USER_AGENT; sets the globals URL and OPTIONS.
#####################
  # get a random mirror:
  get_url URL
  # Blanks in the search term are sent as "+", the separator the search
  # terms already use, so the request URL never contains a raw space.
  OPTIONS="rawcmd=${1// /+}&skip=$2&FORMAT=wwwbriefbibtex"
  # download page; blanks are stripped from "@Article{" lines:
  lynx -useragent="$USER_AGENT" -source "$URL$OPTIONS" \
    | sed -e '/@Article{/ s/ //g' > out
  # sleep for 0-2 seconds between searches so we don't stress the servers
  sleep $((RANDOM % 3))
}

get_hits (){
#$1 is a string that will be set to the number of hits
# (it must not be called hit_total, which is a local of this function)
#
# Scans the downloaded page "out" for a line containing "Paper" and takes
# the digits that follow the last "of <b>" on it. The variable named by $1
# is exported with that value; when no such line exists it is left untouched.
  local hit_total

  # -n with the p flag prints only lines where the substitution succeeded,
  # so a "Paper" line without a count can no longer leak into the result.
  hit_total=$(sed -n '/Paper/ s/^.*of <b>\([0-9][0-9]*\).*/\1/p' out | head -n 1)

  if [[ -n $hit_total ]]; then
    export "$1=$hit_total"
  fi
}

find_eprint_record(){
  # Append the record of an EPRINT search to the bibtex file, keyed by the
  # label used in the .tex source instead of the key found in the page.
  # Reads the downloaded page "out" and the globals label and BIB.
  # Prints the title line(s) of the stored record, or "PAPER NOT FOUND"
  # when the page has no result marker or no record body.
  local body

  if ! grep -q -- '<!-- START RESULTS -->' out; then
    echo "PAPER NOT FOUND"
    return
  fi

  # Body of the first record only: everything after its "@Article{" line up
  # to and including the closing "}" line. Stopping there keeps a page with
  # several records from being merged into one malformed entry.
  body=$(awk '
    /@Article\{/ { if (seen) exit; seen = 1; next }
    seen         { print }
    seen && /^}/ { exit }
  ' out)

  # Write nothing at all when there is no body, so the .bib file never ends
  # up with a dangling "@Article{label," line.
  if [[ -z $body ]]; then
    echo "PAPER NOT FOUND"
    return
  fi

  {
    printf '@Article{%s,\n' "$label"
    printf '%s\n' "$body"
  } >> "$BIB"

  printf '%s\n' "$body" | grep title | sed 's/  */ /g'
}
find_record(){
  # Append the record that mentions $label to the bibtex file.
  # Reads the downloaded page "out" and the globals label and BIB.
  # Copies every line from one containing the label up to the next line
  # starting with "}" and prints the title line(s) of what was copied.
  #
  # The label is handed to awk through the environment and compared as a
  # fixed string: it is never spliced into the awk program, so characters
  # such as "." or "/" in a label cannot change what is matched.
  FINDBIB_LABEL=$label awk '
    index($0, ENVIRON["FINDBIB_LABEL"]) { inside = 1 }
    inside                              { print }
    inside && /^}/                      { inside = 0 }
  ' out \
    | tee -a "$BIB" \
    | grep title | sed 's/  */ /g'
}
process_filename() {
  # Derive the working file names from the command-line argument.
  # $1 - the LaTeX source, with or without its .tex suffix
  # Sets the globals TEXF (name without .tex), FILE (the .aux file) and
  # BIB (the .bib file). When the .aux file does not exist yet, latex is
  # run on $1 to create it.
  # Exits with status 1 when no argument is given (after printing the
  # usage text), when neither the .aux file nor $1 exists, or when latex
  # does not leave an .aux file behind.
  local stem

  if [[ -z $1 ]]; then
    usage
    exit 1
  fi

  stem=${1%.tex}

  if [[ ! -f $stem.aux ]]; then
    if [[ ! -f $1 ]]; then
      echo "No such file $1." >&2
      exit 1
    fi
    latex "$1"
    # Without an .aux file there is nothing to extract labels from.
    if [[ ! -f $stem.aux ]]; then
      echo "latex did not produce $stem.aux." >&2
      exit 1
    fi
  fi

  TEXF=$stem
  FILE=$stem.aux
  BIB=$stem.bib
}

test_dependancy () {
  # Check whether the command named $1 can be run from this shell.
  # NOTE: the return value is inverted with respect to the usual shell
  # convention, and callers rely on that:
  #   returns 0 when the command is MISSING,
  #   returns 1 when it is available.
  #
  # command -v is used for the lookup: it is a shell builtin, and unlike
  # whereis it does not count a stray manual page or source directory as
  # an installed program.
  if command -v "$1" > /dev/null 2>&1; then
    return 1
  fi
  return 0
}

########################################################

# test dependencies
DEPS=("lynx" "sed" "awk")

# Abort early, with a failure status, when a required program is absent.
for item in "${DEPS[@]}"; do
  # test_dependancy returns 0 when the command is MISSING
  if test_dependancy "$item"; then
    echo "Findbib needs $item to work. Please install $item" >&2
    exit 1
  fi
done

#Boundary conditions
# Version string shown in the usage text, the .bib banner and the user agent
VERSION=2.3
# Counter advanced by cpp while the options are parsed
c=0
# Becomes 1 when -a is given (only add new citations)
ADD=0
# Third word of the first line printed by "lynx -version"
LYNX_VERSION=$(lynx -version | awk 'NR == 1 { print $3 }')
USER_AGENT="Findbib_${VERSION}_(Lynx/${LYNX_VERSION})"


# Process the input parameters

# Process flags:

while getopts "b:ah" options; do
  case $options in
    b)
      # Unimplemented: the value is stored but nothing reads it yet
      BIBFILE=$OPTARG
      ;;
    a)
      ADD=1
      ;;
    h)
      # Help was asked for explicitly, so this is not an error
      usage
      exit 0
      ;;
    *)
      # Unknown option or missing option argument
      usage
      exit 1
      ;;
  esac
done

# Drop exactly the words getopts consumed. OPTIND stays correct for bundled
# flags ("-ab file"), attached arguments ("-bfile") and a "--" terminator,
# all of which a hand-kept counter gets wrong.
shift $((OPTIND - 1))

# Process the name of the texfile

process_filename "$1"

# get citations from .aux file

get_labels "$BIB" "$FILE" "$ADD"

# In "add only" mode no bib.labels file is left behind when every citation
# already has an entry; there is then nothing to download and the .bib file
# must not receive another banner.
if [[ -f bib.labels ]]; then

  # print a header to the bibtex file

  print_header "$BIB" "$ADD"

  # start getting the records from spires

  echo "---------- REQUESTING RECORDS FROM spires.slac.stanford.edu:"

  while read -r label; do
    echo "------------------------------------------------------------"
    flag=0
    echo "Searching for label:"
    # i  = offset of the next result page, NN = number of hits (0 = unknown)
    i=0
    NN=0

    # parse the label and set $SEARCH to the SPIRES search term;
    # parse_label sets flag=1 for labels it cannot turn into a search
    parse_label "$label"

    # start a loop to search through the results, one page of 25 at a time

    while [[ $flag = 0 ]]; do

      download_search "$SEARCH" "$i"

      i=$((i + 25))
      # find out the number of hits for our search
      if [[ $NN -eq 0 ]]; then
        get_hits NN
        echo "$NN record(s) found "
      fi
      if [[ $NN -gt 25 ]]; then
        echo "Searching----->  $i"
      fi
      # check to see if there were results
      if [[ $NN -eq 0 ]]; then
        flag=1
        echo "PAPER NOT FOUND."
      # fixed-string match: the label is data, not a regular expression
      elif grep -qF -- "$label" out; then
        flag=1
        echo "Found:"
        if [[ $SEARCH = EPRINT* ]]; then
          find_eprint_record
        else
          find_record
        fi
      # check to see if there are papers left:
      elif [[ $i -gt $NN ]]; then
        flag=1
        echo " PAPER NOT FOUND."
      fi
    done

  done < bib.labels

  # clean up
  rm -f bib.labels out

  echo "---------- DONE. RECORDS WRITTEN TO $BIB."
else
  echo "---------- NO NEW LABELS FOUND. $BIB LEFT UNCHANGED."
fi

echo "---------- NOW RUNNING BIBTEX:"

bibtex "$TEXF"

exit

