source: trunk/twindoi.sh @ 65

Last change on this file since 65 was 63, checked in by pinsard, 16 years ago

usage of xml(starlet) in twindoi.sh

  • Property svn:executable set to *
  • Property svn:keywords set to Id
File size: 2.2 KB
Line 
1#! /bin/sh
2# module :
3# detection of duplicate DOI
4#
5# original location :
6# /usr/home/fplod/src/superbib_ws/twindoi.sh sur aedon.locean-ipsl.upmc.fr
7#
8# example :
9# $ ./twindoi.sh -i data/biball.txt -t raw
10# $ ./twindoi.sh -i data/biball.xml -t xml
11#
12# update :
13# ++ option debug
14# ++ the following command, which is not appropriate
15# (xml file declared as raw text), does not raise any alert and also
16# checks inside xml comments
17# $ ./twindoi.sh -i data/biball.xml -t raw
18# $Id$
19# fplod 2008-05-05T14:26:31Z aedon.locean-ipsl.upmc.fr (Darwin)
20# usage of xml(starlet) for doi extraction in xml file
21# fplod 2007-06-20T16:12:22Z aedon.locean-ipsl.upmc.fr (Darwin)
22# consolidation and homogenization
23# smasson 2007-06-20T16:11:47Z
24# creation
25#
26#
# Shell setup and script-wide constants.
set -o posix
# Name of this script without its directory part, used in the usage text.
# Parameter expansion is used instead of an unquoted `basename ${0}` so that
# a script path containing blanks is not split into several words.
command=${0##*/}
# UTC timestamp and a log path derived from it.
# NOTE(review): neither log_date nor log is referenced by any other visible
# line of this script - confirm whether they can be dropped.
log_date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
log=/tmp/${command}.${log_date}
#
usage=" Usage : ${command} -i filein -t type"
33#
# Command line parsing.
#   -i filein : file to scan (required)
#   -t type   : kind of file (required)
# Any other argument, an option given without its value, or a missing
# required option prints the usage text and exits with status 1.
filein=
type=
# Loop on the argument count rather than on "${1}" being non-empty, so that
# an empty argument is rejected instead of silently ending the parsing.
while [ $# -gt 0 ]
do
 case "${1}" in
 -i) # filein
  [ $# -ge 2 ] || { echo "${usage}"; exit 1; }
  filein=${2}
  shift
 ;;
 -t) # type
  [ $# -ge 2 ] || { echo "${usage}"; exit 1; }
  type=${2}
  shift
 ;;
 *) # other choice
  echo "${usage}"
  exit 1
 ;;
 esac
 shift # next flag
done
set -u
# Both options are required: report it here with the usage text rather than
# letting a later expansion abort with an "unbound variable" error.
if [ -z "${filein}" ] || [ -z "${type}" ]
then
 echo "${usage}"
 exit 1
fi
53#
# Validate the inputs before any processing.
#
# check for filein
# The expansion is quoted: unquoted, an empty value reduces the test to
# `[ ! -f ]`, which is false and lets the script continue, and a path
# containing blanks makes the test fail with a syntax error.
if [ ! -f "${filein}" ]
then
  echo "eee : ${filein} not found"
  exit 1
fi
#
# Map the input file to the variable matching its declared type; any type
# other than raw or xml is rejected here with exit status 1.
case "${type}" in
raw) # file like data/biball.txt
 fileraw=${filein}
;;
xml)  # file like data/biball.xml
 filexml=${filein}
;;
*)
   echo "eee : type should be raw or xml"
   exit 1
;;
esac
73#
# Print the DOI entries found in one input file, one per line, sorted.
#   $1 : input type, "raw" or "xml"
#   $2 : path of the file to scan
# Output (stdout): the entries in `sort -d` order, without the entries
# containing the literal placeholder "???".
list_doi() {
 case "${1}" in
 raw)
  # Keep the lines holding "doi:" in any letter case, drop everything up to
  # and including that marker (also in any letter case, so that the lines
  # selected by grep -i are all stripped the same way), then remove a
  # trailing period.
  # NOTE(review): the second expression puts one blank in front of every
  # entry whose trailing period was removed; kept as-is because it shows up
  # in the report lines - confirm whether it is intended.
  grep -i "doi:" -- "${2}" |
  sed -e 's/^.*[dD][oO][iI]: *//' -e 's/^\(.*\)\.$/ \1/'
  ;;
 xml)
  # `xml` is the xml(starlet) command named in the file header; it prints
  # the value of each DocBook biblioid element of class "doi".
  xml sel -N dbk="http://docbook.org/ns/docbook" \
   -t -m "//dbk:biblioid[@class='doi']" -v . -n "${2}"
  ;;
 esac |
 grep -F -v '???' |
 sort -d
}
# Build the sorted DOI list used by the rest of the script.
# No default arm: the type has already been restricted to raw or xml by the
# preceding case statement.
# NOTE(review): /tmp/doilist.txt is a fixed, predictable path, so concurrent
# runs overwrite each other's list; moving to mktemp requires changing every
# statement of the script that uses this path at the same time.
case "${type}" in
raw) list_doi raw "${fileraw}" ;;
xml) list_doi xml "${filexml}" ;;
esac > /tmp/doilist.txt
93#
# Number of entries in the DOI list.
nl=$( cat /tmp/doilist.txt | wc -l )
# Nothing to compare: warn, drop the list and stop with status 1.
test ${nl} -eq 0 && {
   echo "www : no DOI found in ${filein}"
   rm /tmp/doilist.txt 2> /dev/null
   exit 1
}
# Report every entry of a sorted list that is repeated on the next line.
#   $1 : path of a list with one entry per line, identical entries adjacent
# Output (stdout): "eee : line N : ENTRY" for each line N equal to line N+1.
report_adjacent_duplicates() {
   # A single awk pass replaces re-reading the list with head | tail for
   # every line, and avoids `==` inside `[ ]`, which is not a POSIX test
   # operator. Appending "" forces a string comparison, so numeric
   # look-alikes such as "1.0" and "1.00" are not treated as equal.
   awk 'NR > 1 && ($0 "") == (prev "") {
           printf "eee : line %d : %s\n", NR - 1, prev
        }
        { prev = $0 }' < "${1}"
}
report_adjacent_duplicates /tmp/doilist.txt
109#
# Final cleanup: delete the DOI list, discarding any message from rm.
rm /tmp/doilist.txt 2> /dev/null
# The script ends with status 0 here, whether or not duplicates were listed.
exit 0
Note: See TracBrowser for help on using the repository browser.