[20] | 1 | #! /bin/sh |
---|
| 2 | # module : |
---|
| 3 | # detection of duplicate DOI |
---|
| 4 | # |
---|
| 5 | # original location : |
---|
| 6 | # /usr/home/fplod/src/superbib_ws/twindoi.sh on aedon.locean-ipsl.upmc.fr |
---|
| 7 | # |
---|
| 8 | # example : |
---|
| 9 | # $ ./twindoi.sh -i data/biball.txt -t raw |
---|
| 10 | # $ ./twindoi.sh -i data/biball.xml -t xml |
---|
| 11 | # |
---|
[48] | 12 | # update : |
---|
[20] | 13 | # ++ option debug |
---|
| 14 | # ++ the following command which is not convenient |
---|
| 15 | # (xml vs txt) did not give any alert and check inside |
---|
| 16 | # xml comments |
---|
[23] | 17 | # $ ./twindoi.sh -i data/biball.xml -t raw |
---|
[20] | 18 | # $Id$ |
---|
[63] | 19 | # fplod 2008-05-05T14:26:31Z aedon.locean-ipsl.upmc.fr (Darwin) |
---|
| 20 | # usage of xml(starlet) for doi extraction in xml file |
---|
| 21 | # fplod 2007-06-20T16:12:22Z aedon.locean-ipsl.upmc.fr (Darwin) |
---|
| 22 | # consolidation and homogenization |
---|
[20] | 23 | # smasson 2007-06-20T16:11:47Z |
---|
| 24 | # creation |
---|
| 25 | # |
---|
| 26 | # |
---|
# Ask the shell for POSIX mode.
set -o posix

# Name of the script as invoked, without its directory; used in the usage
# message and in the log file name.
command=${0##*/}
# UTC timestamp and the log file path built from it.
# NOTE(review): "log" is not referenced by the rest of the script as shown.
log_date=$(date -u +"%Y-%m-%dT%H:%M:%SZ")
log=/tmp/${command}.${log_date}
#
usage=" Usage : ${command} -i filein -t type"
#
# parse_args ARG...
#   Parse the command line options :
#     -i filein   input file, stored in the global variable "filein"
#     -t type     input type, stored in the global variable "type"
#   Any other argument, or an option given without its value, prints the
#   usage message and exits with status 1.
parse_args() {
  # Loop on the argument count rather than on "${1}" being non-empty, so
  # that an empty argument is reported instead of silently ending parsing.
  while [ $# -gt 0 ]
  do
    case "${1}" in
      -i) # filein
        # The value must be present, otherwise the second "shift" below
        # would fail with an obscure shell error.
        [ $# -ge 2 ] || { echo "${usage}"; exit 1; }
        filein=${2}
        shift
        ;;
      -t) # type
        [ $# -ge 2 ] || { echo "${usage}"; exit 1; }
        type=${2}
        shift
        ;;
      *) # other choice
        echo "${usage}"
        exit 1
        ;;
    esac
    shift # next flag
  done
}

parse_args "$@"
# From here on, expanding an unset variable is an error.
set -u
#
# Both options are mandatory : report a missing one with the usage message
# instead of letting "set -u" abort on an unbound variable.
if [ -z "${filein+set}" ] || [ -z "${type+set}" ]
then
  echo "${usage}"
  exit 1
fi
#
# check for filein
# (quoted, so that empty names and names containing blanks are tested
# correctly)
if [ ! -f "${filein}" ]
then
  echo "eee : ${filein} not found"
  exit 1
fi
#
# check for type ; the input file is recorded under the variable matching
# its type (fileraw or filexml), which the extraction step reads.
case "${type}" in
  raw) # file like data/biball.txt
    fileraw=${filein}
    ;;
  xml) # file like data/biball.xml
    filexml=${filein}
    ;;
  *)
    echo "eee : type should be raw or xml"
    exit 1
    ;;
esac
#
# Extract the DOIs of the input file into a sorted list, then report every
# DOI which appears on two consecutive lines of that list.
#
# Reads  : type, filein, and fileraw or filexml (set by the checks above).
# Output : one "eee : line N : DOI" line per pair of identical neighbours,
#          N being the line number of the first one in the sorted list.
# Exit   : 1 when no DOI is found or on error, 0 otherwise (even when
#          duplicates are reported).
#
# Private temporary file for the sorted DOI list : a fixed name in /tmp is
# predictable and would be clobbered by a concurrent run.
doilist=$(mktemp "${TMPDIR:-/tmp}/doilist.XXXXXX") || {
  echo "eee : unable to create temporary file"
  exit 1
}
# Remove the temporary file on every exit path, including interruption.
trap 'rm -f "${doilist}"' EXIT
trap 'exit 1' HUP INT TERM
#
case "${type}" in
  raw)
    # Keep what follows "doi:" on each matching line. The sed pattern is
    # case-insensitive like the grep selection, so "DOI:" lines are stripped
    # too. A final "." is dropped; lines containing "???" are ignored.
    # NOTE(review): the second sed puts a blank in front of every DOI which
    # had a final "." ; kept as is, but such a DOI never matches the same
    # DOI written without the final "." - confirm this is intended.
    grep -i -e "doi:" -- "${fileraw}" \
      | sed -e 's/^.*[Dd][Oo][Ii]: *//' \
            -e 's/^\(.*\)\.$/ \1/' \
      | grep -v "???" \
      | sort -d > "${doilist}"
    ;;
  xml)
    # Without this check a missing tool would be reported as "no DOI found".
    if ! command -v xml > /dev/null 2>&1
    then
      echo "eee : xml (xmlstarlet) command not found"
      exit 1
    fi
    # One line per DocBook biblioid element of class doi.
    xml sel -N dbk="http://docbook.org/ns/docbook" \
      -t -m "//dbk:biblioid[@class='doi']" -v . -n "${filexml}" \
      | grep -v "???" \
      | sort -d > "${doilist}"
    ;;
  *)
    echo "eee : error unknown file type"
    exit 1
    ;;
esac
#
# An empty list means that no DOI was extracted.
if [ ! -s "${doilist}" ]
then
  echo "www : no DOI found in ${filein}"
  exit 1
fi
#
# Single pass over the sorted list : compare each line with the previous one.
# prev is forced to a string so that DOIs are never compared as numbers.
LC_ALL=C awk '
  NR > 1 && $0 == prev { printf "eee : line %d : %s\n", NR - 1, prev }
  { prev = $0 "" }
' "${doilist}"
#
exit 0