source: trunk/twindoi.sh @ 347

Last change on this file since 347 was 347, checked in by pinsard, 11 years ago

dynamic xml vs xmlstarlet

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 4.4 KB
Line 
1#! /bin/sh
2#+
3#
4# .. program:: twindoi.sh
5#
6# ==========
7# twindoi.sh
8# ==========
9#
10# SYNOPSIS
11# ========
12#
13# ::
14#
15#  twindoi.sh -i filein -t type
16#
17#
18# DESCRIPTION
19# ===========
20#
21#
22# .. option:: -i <filein>
23# .. option:: -t <type>
24#
25# detection of duplicate DOI
26#
27# EXAMPLES
28# ========
29#
30# To detect duplicate DOI in a raw file::
31#
32#  twindoi.sh -i data/biball.txt -t raw
33#
34#
35# To detect duplicate DOI in a XML/DocBook file::
36#
37#  twindoi.sh -i data/biball.xml -t xml
38#
39# To detect duplicate DOI in a bibtex file::
40#
41#  twindoi.sh -i data/biball.bib -t bibtex
42#
43# TODO
44# ====
45#
46# ++ option debug
47#
48# ++ the following command, which is not appropriate (XML file given
49# as raw type), does not give any alert and also checks inside XML comments
50# ::
51#
52#  twindoi.sh -i data/biball.xml -t raw
53#
54# EVOLUTIONS
55# ==========
56#
57# $Id$
58#
59# - fplod 20131010T113730Z callisto.locean-ipsl.upmc.fr (Linux)
60#
61#   * dynamic xmlstarlet vs xml
62#
63# - fplod 20120521T080342Z cratos (Linux)
64#
65#   * rename type variable to ftype to avoid usage of a reserved word
66#   * revision of indentation
67#   * typo
68#   * add bibtex as file type
69#
70# - fplod 20100318T083708Z aedon.locean-ipsl.upmc.fr (Darwin)
71#
72#   * unset
73#
74# - fplod 2008-05-05T14:26:31Z aedon.locean-ipsl.upmc.fr (Darwin)
75#
76#   * usage of xml(starlet) for doi extraction in xml file
77#
78# - fplod 2007-06-20T16:12:22Z aedon.locean-ipsl.upmc.fr (Darwin)
79#
80#   * consolidation and homogeneisation
81#
82# - smasson 2007-06-20T16:11:47Z
83#
84#   * creation
85#
86#-
87#
# Switch the shell to POSIX mode, except on AIX and IRIX64 where the
# step is skipped with a warning.
system=$(uname)
case "${system}" in
    AIX|IRIX64)
        # ${command} is only assigned later in the script, so the script
        # name is derived from ${0} here to keep the warning meaningful.
        echo "$(basename "${0}") : www : no specific posix checking"
    ;;
    *)
        # Probe in a subshell first: a shell that does not know the
        # "posix" option may abort on it, which would otherwise end the
        # script at this point.
        if ( set -o posix ) 2> /dev/null
        then
            set -o posix
        fi
    ;;
esac
unset system
98#
# Abort on any reference to an unset variable.
set -u
# Script name without and with its .sh suffix; ${0} is quoted so that a
# path containing blanks is not split into several words.
action=$(basename "${0}" .sh)
command=$(basename "${0}")
# PROJECT_LOG is used as the directory of the log and work files: fail
# with an explicit message rather than the generic "unbound variable"
# abort (unset) or paths rooted at / (empty).
if [ -z "${PROJECT_LOG:-}" ]
then
    echo "${command} : eee : PROJECT_LOG is not set"
    exit 1
fi
if [ ! -d "${PROJECT_LOG}" ]
then
    echo "${command} : eee : directory ${PROJECT_LOG} not found"
    exit 1
fi
# UTC timestamp used as suffix of the log file name.
log_date=$(date -u +"%Y%m%dT%H%M%SZ")
log=${PROJECT_LOG}/${action}.log.${log_date}
104#
# Check that the XML command line tool named by ${xmlcmd} is available.
# NOTE(review): xmlcmd is expected to come from the environment
# (presumably "xml" or "xmlstarlet") - confirm against the caller.
tool=${xmlcmd:-}
# An empty name must be rejected explicitly: a lookup with no name at all
# reports success, which would let the script go on without any tool.
if [ -z "${tool}" ]
then
    echo "${command} : eee : xmlcmd is not set"
    exit 1
fi
if ! command -v "${tool}" 1> /dev/null 2>&1
then
    echo "${command} : eee : tool ${tool} not found"
    exit 1
fi
unset tool
115#
# Usage text, also printed by the option parsing that follows.
usage=" Usage : ${command} -i filein -t type"
# Two options, each followed by a value: at least four words are needed.
minargcount=4
[ ${#} -ge ${minargcount} ] || {
    echo "${command} : eee : not enought arguments"
    echo "${usage}"
    exit 1
}
unset minargcount
127#
# Parse the command line: -i <filein> and -t <type>, in any order.
# Sets filein and ftype; any other word is rejected with the usage text.
while [ ${#} -gt 0 ]
do
    case "${1}" in
        -i|-t)
            # A flag given as last word has no value: report it instead
            # of failing on the unset ${2}.
            if [ ${#} -lt 2 ]
            then
                echo "${command} : eee : option ${1} requires a value"
                echo "${usage}"
                exit 1
            fi
            if [ "${1}" = "-i" ]
            then
                filein=${2}
            else
                ftype=${2}
            fi
            # skip the value
            shift
        ;;
        *)
            # other choice
            echo "${command} : eee : unknown option ${1}"
            echo "${usage}"
            exit 1
        ;;
    esac
    # next flag
    shift
done
# The argument count alone does not guarantee that both options were
# given (e.g. "-i a -i b"), so check them before they are used.
if [ -z "${filein:-}" ] || [ -z "${ftype:-}" ]
then
    echo "${command} : eee : both -i filein and -t type are required"
    echo "${usage}"
    exit 1
fi
unset usage
150#
# check for filein
# The name is quoted so that an empty value or a path containing blanks
# is tested as one single path instead of slipping through the test.
if [ ! -f "${filein}" ]
then
    echo "${command} : eee : ${filein} not found"
    exit 1
fi
157#
# Copy the input file name into the variable dedicated to its declared
# type (fileraw, filexml or filebibtex); any other type is an error.
if [ "${ftype}" = "raw" ]
then
    # file like data/biball.txt
    fileraw=${filein}
elif [ "${ftype}" = "xml" ]
then
    # file like data/biball.xml
    filexml=${filein}
elif [ "${ftype}" = "bibtex" ]
then
    # file like data/biball.bib
    filebibtex=${filein}
else
    echo "${command} : eee : type should be raw, xml or bibtex"
    exit 1
fi
unset filein
174#
# Extract the DOI of the input file, one per line and sorted, into the
# work file ${PROJECT_LOG}/${action}${$}.txt.
# Entries containing "???" are dropped.
case "${ftype}" in
    raw)
        # The sed pattern is case-insensitive like the grep before it, so
        # that "DOI:" lines are stripped too. A trailing period is
        # removed without adding any leading blank, so that the same DOI
        # written with and without the period gives the same entry.
        grep -i "doi:" "${fileraw}" | \
            sed -e 's/^.*[Dd][Oo][Ii]: *//' -e 's/\.$//' | \
            grep -v "???" | \
            sort -d > "${PROJECT_LOG}/${action}${$}.txt"
    ;;
    bibtex)
        # Same processing as for raw files, with the "doi = " syntax.
        grep -i "doi *= *" "${filebibtex}" | \
            sed -e 's/^.*[Dd][Oo][Ii] *= *//' -e 's/\.$//' | \
            grep -v "???" | \
            sort -d > "${PROJECT_LOG}/${action}${$}.txt"
    ;;
    xml)
        # Value of each DocBook biblioid element of class "doi".
        "${xmlcmd}" sel -N dbk="http://docbook.org/ns/docbook" \
            -t -m "//dbk:biblioid[@class='doi']" -v . -n "${filexml}" | \
            grep -v "???" | \
            sort -d > "${PROJECT_LOG}/${action}${$}.txt"
    ;;
    *)
        echo "${command} : eee : error unknown file type ${ftype}"
        exit 1
    ;;
esac
unset ftype
202#
# Report duplicated DOI. The work file is sorted, so identical entries
# sit on consecutive lines: every line equal to the next one is reported
# together with its line number in the work file.
doilist="${PROJECT_LOG}/${action}${$}.txt"
# A missing or empty work file means that nothing was extracted.
if [ ! -s "${doilist}" ]
then
    # filein may already be unset at this point; fall back on the
    # per-type variables so that this message cannot trip "set -u".
    echo "${command} : www : no DOI found in ${filein:-${fileraw:-${filexml:-${filebibtex:-}}}}"
    rm "${doilist}" 2> /dev/null
    exit 1
fi
# Single pass over the file: n is the line number of ${previous}.
n=0
previous=
while IFS= read -r current
do
    if [ ${n} -gt 0 ] && [ "${current}" = "${previous}" ]
    then
        echo "${command} : eee : line ${n} : ${previous}"
    fi
    previous=${current}
    n=$(( n + 1 ))
done < "${doilist}"
unset n
unset previous
unset current
unset doilist
222#
# Remove the work file (errors from rm are deliberately discarded) and
# drop the script variables before the successful exit. The path is
# quoted so that the file is also removed when PROJECT_LOG contains
# blanks.
rm "${PROJECT_LOG}/${action}${$}.txt" 2> /dev/null
unset command
unset log
unset log_date
#
#++set
exit 0
Note: See TracBrowser for help on using the repository browser.