source: trunk/twindoi.sh @ 234

Last change on this file since 234 was 205, checked in by pinsard, 12 years ago

remove dupe, fixes,

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 4.1 KB
Line 
1#! /bin/sh
2#+
3#
4# .. program:: twindoi.sh
5#
6# ==========
7# twindoi.sh
8# ==========
9#
10# SYNOPSIS
11# ========
12#
13# ::
14#
15#  twindoi.sh -i filein -t type
16#
17#
18# DESCRIPTION
19# ===========
20#
21#
22# .. option:: -i <filein>
23# .. option:: -t <type>
24#
25# detection of duplicate DOI
26#
27# EXAMPLES
28# ========
29#
30# To detect duplicate DOI in a raw file::
31#
32#  twindoi.sh -i data/biball.txt -t raw
33#
34#
35# To detect duplicate DOI in a XML/DocBook file::
36#
37#  twindoi.sh -i data/biball.xml -t xml
38#
39# To detect duplicate DOI in a bibtex file::
40#
41#  twindoi.sh -i data/biball.bib -t bibtex
42#
43# TODO
44# ====
45#
46# ++ option debug
47#
48# ++ the following command, which is not convenient
49# (xml vs txt), does not give any alert and checks inside xml comments
50# ::
51#
52#  twindoi.sh -i data/biball.xml -t raw
53#
54# EVOLUTIONS
55# ==========
56#
57# $Id$
58#
59# - fplod 20120521T080342Z cratos (Linux)
60#
61#   * rename type variable to ftype to avoid usage of a reserved word
62#   * revision of indentation
63#   * typo
64#   * add bibtex as file type
65#
66# - fplod 20100318T083708Z aedon.locean-ipsl.upmc.fr (Darwin)
67#
68#   * unset
69#
70# - fplod 2008-05-05T14:26:31Z aedon.locean-ipsl.upmc.fr (Darwin)
71#
72#   * usage of xml(starlet) for doi extraction in xml file
73#
74# - fplod 2007-06-20T16:12:22Z aedon.locean-ipsl.upmc.fr (Darwin)
75#
76#   * consolidation and homogenization
77#
78# - smasson 2007-06-20T16:11:47Z
79#
80#   * creation
81#
82#-
83#
# Switch the shell to POSIX mode, except on AIX and IRIX64 where only a
# warning is printed.
system=$(uname)
case "${system}" in
    AIX|IRIX64)
        # ${command} is assigned later in the script, so the program name
        # is derived from ${0} here; otherwise the warning would start
        # with an empty name.
        echo "$(basename "${0}") : www : no specific posix checking"
    ;;
    *)
        set -o posix
    ;;
esac
unset system
94#
# From here on, expanding an unset variable is a fatal error.
set -u
#
# Program identity taken from the invocation path; ${0} is quoted so that
# a path containing blanks does not split into several basename operands.
action=$(basename "${0}" .sh)
command=$(basename "${0}")
#
# PROJECT_LOG is the path prefix of the log name built below. An unset
# value would abort under "set -u" with a bare shell error, and an empty
# one would silently yield paths directly under "/", so both are rejected.
if [ -z "${PROJECT_LOG:-}" ]
then
    echo "${command} : eee : PROJECT_LOG is not defined"
    exit 1
fi
log_date=$(date -u +"%Y%m%dT%H%M%SZ")
log=${PROJECT_LOG}/${action}.log.${log_date}
#
usage=" Usage : ${command} -i filein -t type"
#
# Two options, each followed by a value: at least four arguments.
minargcount=4
#echo " narg ${#}"
if [ "${#}" -lt "${minargcount}" ]
then
    echo "${command} : eee : not enought arguments"
    echo "${usage}"
    exit 1
fi
unset minargcount
113#
# Parse the command line: "-i <filein>" gives the input file and
# "-t <type>" its format. Any other word is rejected with the usage text.
while [ "${#}" -gt 0 ]
do
    case "${1}" in
        -i|-t)
            # Both flags need a value. Without this check a trailing flag
            # would abort on ${2} with an "unbound variable" shell error
            # because of "set -u".
            if [ "${#}" -lt 2 ]
            then
                echo "${command} : eee : option ${1} requires an argument"
                echo "${usage}"
                exit 1
            fi
            if [ "${1}" = "-i" ]
            then
                filein=${2}
            else
                ftype=${2}
            fi
            # skip the value; the flag itself is skipped below
            shift
        ;;
        *)
           # other choice
           echo "${command} : eee : unknown option ${1}"
           echo "${usage}"
           exit 1
        ;;
    esac
    # next flag
    shift
done
#
# A repeated flag (for example "-i a -i b") satisfies the argument count
# but leaves the other variable unset; report it here instead of letting
# a later expansion abort the script.
if [ -z "${filein+set}" ] || [ -z "${ftype+set}" ]
then
    echo "${command} : eee : both -i and -t are required"
    echo "${usage}"
    exit 1
fi
unset usage
136#
# The input must be an existing regular file. The expansion is quoted so
# that a name containing blanks (or an empty name) is tested as one word.
if [ ! -f "${filein}" ]
then
    echo "${command} : eee : ${filein} not found"
    exit 1
fi
#
# Store the input path in the variable that matches the declared type;
# any other type is an error.
case "${ftype}" in
    raw) # file like data/biball.txt
        fileraw=${filein}
    ;;
    xml) # file like data/biball.xml
        filexml=${filein}
    ;;
    bibtex) # file like data/biball.bib
        filebibtex=${filein}
    ;;
    *)
       echo "${command} : eee : type should be raw, xml or bibtex"
       exit 1
    ;;
esac
unset filein
160#
# Extract the DOIs of the input file into a sorted scratch file, then
# report every line that is identical to the line following it.
#
# Scratch file; the EXIT trap removes it on every exit path, including
# the early "exit 1" branches below.
doifile=${PROJECT_LOG}/${action}${$}.txt
trap 'rm -f "${doifile}"' EXIT
#
# NOTE(review): in the raw and bibtex pipelines the second sed strips a
# trailing dot but its replacement also inserts a leading blank, so
# "X." becomes " X" and does not compare equal to a plain "X".
# Kept as is - confirm whether the blank is intended.
#
# doisource remembers the input path for the messages below; the
# original input variable has already been unset at this point.
case "${ftype}" in
    raw)
        doisource=${fileraw}
        grep -i "doi:" "${fileraw}" | \
           sed -e "s/^.*doi: *//" | \
           sed -e "s/^\(.*\)\.$/ \1/" | \
           grep -v "???" | \
           sort -d > "${doifile}"
    ;;
    bibtex)
        doisource=${filebibtex}
        grep -i "doi *= *" "${filebibtex}" | \
           sed -e "s/^.*doi *= *//" | \
           sed -e "s/^\(.*\)\.$/ \1/" | \
           grep -v "???" | \
           sort -d > "${doifile}"
    ;;
    xml)
        doisource=${filexml}
        xml sel -N dbk="http://docbook.org/ns/docbook" \
          -t -m "//dbk:biblioid[@class='doi']" -v . -n "${filexml}" | \
        grep -v "???" | \
        sort -d > "${doifile}"
    ;;
    *)
        echo "${command} : eee : error unknown file type ${ftype}"
        exit 1
    ;;
esac
unset ftype
#
# sort terminates every line it writes, so an empty scratch file is the
# same condition as a zero line count.
if [ ! -s "${doifile}" ]
then
    echo "${command} : www : no DOI found in ${doisource}"
    exit 1
fi
#
# Single pass over the sorted list: a line equal to its predecessor is a
# duplicate, reported with the number of the first line of the pair.
# Appending "" forces a string comparison, so two DOIs that both look
# like numbers are never compared by numeric value.
awk -v prog="${command}" '
    NR > 1 && ($0 "") == (prev "") {
        print prog " : eee : line " (NR - 1) " : " prev
    }
    { prev = $0 }
' "${doifile}"
#
unset doisource
unset command
unset log
unset log_date
#
#++set
# doifile stays set on purpose: the EXIT trap still expands it.
exit 0
Note: See TracBrowser for help on using the repository browser.