source: trunk/external/bibtex2xml.py @ 140

Last change on this file since 140 was 67, checked in by pinsard, 16 years ago

Introducing BibTeX ingestion. Still some trouble because of the firstname+surname decomposition of personname.

  • Property svn:executable set to *
File size: 18.1 KB
Line 
1#!/usr/bin/env python
2# -*- coding: utf-8 -*-
3# Time-stamp: "2006-07-26T09:50:29 vidar"
4"""
5  Decoder for bibliographic data, BibTeX
6  Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
7
8  (c) Vidar Bronken Gundersen, Sara Sprenkle
9  http://bibtexml.sourceforge.net/
10  Reuse approved as long as this notification is kept.
11  License: http://creativecommons.org/licenses/GPL/2.0/
12
13  Contributions/thanks to:
14  Thomas Karl Schwaerzler, read stdin
15  Egon Willighagen, http://jreferences.sf.net/
16  Richard Mahoney, for providing a test case
17
18  This is Sara Sprenkle's rewrite of our original script, which
19  is changed to be more robust and handle more bibtex features:
20  3.  Allow spaces between @type and first {
21  4.  'author' fields with multiple authors split by ' and '
22      are put in separate xml 'bibtex:person' tags.
23  5.  Option for Titles: words are capitalized
24      only if first letter in title or capitalized inside braces
25  6.  Removes braces from within field values
26  7.  Ignores comments in bibtex file (including @comment{ or % )
27  8.  Replaces some special latex tags, e.g., replaces ~ with ' '
28  9.  Handles bibtex @string abbreviations
29        --> includes bibtex's default abbreviations for months
30        --> does concatenation of abbr # ' more ' and ' more ' # abbr
31  10. Handles @type( ... ) or @type{ ... }
32  11. The keywords field is split on , or ; and put into
33      separate xml 'bibtex:keywords' tags
34  12. Ignores @preamble
35
36  replace ':' with '-' for bibtex:entry@id: unique-ids cannot contain ':'
37
38  Known Limitations
39  1.  Does not transform Latex encoding like math mode
40         and special latex symbols.
41  2.  Does not parse author fields into first and last names.
42      E.g., it does not do anything special to an author whose name is
43      in the form LAST_NAME, FIRST_NAME. In the 'author' tag, it will show up as
44      <bibtex:author>LAST_NAME, FIRST_NAME</bibtex:author>
45  3.  Does not handle 'crossref' fields other than to print
46      <bibtex:crossref>...</bibtex:crossref>
47  4.  Does not inform user of the input's format errors.
48       You just won't be able to transform the file later with XSL
49       Create error.log file?
50
51  5.  Special treatment of
52      howpublished = '\url{http://www.cs.duke.edu/ari/crisp/}',
53
54  6. document functions with docstrings
55
56  You will have to manually edit the XML output if you need to handle
57  these (and unknown) limitations.
58
59"""
60
61import string, re
62
# Regex character class of the characters accepted in an @string
# abbreviation name.
valid_name_chars = r'[\w\-:]'

# Module-level regular expressions, compiled once at import time.
# Raw string literals are used so that regex escapes such as \s and \w are
# never interpreted as (invalid) Python string escapes.

# separates the individual names of an 'author' field
author_rex = re.compile(r'\s+and\s+')
# matches a single opening or closing brace; used by removebraces(str)
rembraces_rex = re.compile(r'[{}]')
# captures a braced run of word characters; used by capitalizetitle(data)
capitalize_rex = re.compile(r'({\w*})')

# used by bibtexkeyword(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces; the delimiters are captured
# so that they are kept in the split result
delimiter_rex = re.compile(r'([{}"])')

# 'field = value' lines: field_rex captures everything after the '=',
# data_rex captures the value only up to the first comma
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')
82
83
#
# strip every brace character from a string
#
def removebraces(str):
    """Return *str* with all '{' and '}' characters deleted."""
    # NOTE: the parameter name shadows the builtin `str` inside this function.
    # Splitting on the brace pattern and re-joining drops exactly the braces.
    return ''.join(rembraces_rex.split(str))
89
# fix author so that it creates multiple authors,
# split by "and"
def bibtexauthor(data):
    """Return the <bibtex:author> element for an 'author' field value.

    @param data --> author string; several names are separated by ' and '
    @return a single line '<bibtex:author>NAME</bibtex:author>' when there
            is one name, otherwise an author element holding one
            '<bibtex:person>' line per name

    Every name is stripped of surrounding whitespace and of braces.  The
    single-author case is stripped as well, so it is treated the same way
    as the multi-author case.
    """
    names = [removebraces(name.strip()) for name in author_rex.split(data)]

    if len(names) > 1:
        people = ['<bibtex:person>' + name + '</bibtex:person>' + '\n'
                  for name in names]
        body = '\n' + ''.join(people)
    else:
        # re.split always yields at least one element, even for ''
        body = names[0]

    return ('<bibtex:author>' + body + '</bibtex:author>').strip()
104       
105
# @param data --> title string
# @return the <bibtex:title> element for the title
def bibtextitle(data):
    """Wrap the title in <bibtex:title> tags.

    Braces are removed first, then surrounding whitespace is stripped.
    """
    cleaned = removebraces(data).strip()
    return ''.join(['<bibtex:title>', cleaned, '</bibtex:title>'])
115
116
# @param data --> keywords string, delimited by , or ;
# @return one <bibtex:keywords> element per keyword, newline separated
def bibtexkeyword(data):
    """Split the keywords field and wrap every keyword in its own tag.

    Each keyword is stripped of surrounding whitespace before its braces
    are removed.
    """
    elements = []
    for raw_keyword in keywords_rex.split(data):
        cleaned = removebraces(raw_keyword.strip())
        elements.append('<bibtex:keywords>' + cleaned + '</bibtex:keywords>')
    return '\n'.join(elements).strip()
127
128
129
# data = title string
# @return the capitalized title (first letter is capitalized),
# rest are capitalized only if capitalized inside braces
def capitalizetitle(data):
    """Return the title with BibTeX-style capitalization applied.

    The title is split around braced words ('{Word}').  Braced words keep
    their capitalization and lose their braces; the first unbraced phrase
    has its leading whitespace dropped and is capitalized; every other
    unbraced phrase is lower-cased.
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str method instead of the string-module function, which is not
        # available on Python 3
        check = phrase.lstrip()

        if check.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first phrase --> capitalize first letter (after spaces)
            pieces.append(check.capitalize())
        else:
            pieces.append(phrase.lower())

    return ''.join(pieces)
152
153
#
# return the XML lines for the transformed "filecontents_source"
#
def bibtexdecoder(filecontents_source):
    """Translate washed BibTeX lines into BibTeXML lines.

    @param filecontents_source --> iterable of lines, one BibTeX construct
           per line (entry header, 'field = value', or closing brace)
    @return list of XML strings without trailing newline; an element of the
            list may itself contain embedded newlines

    For every line: the XML special characters &, < and > are escaped, an
    entry header '@type{key,' opens <bibtex:entry> and <bibtex:type>, a
    line consisting of a closing brace closes the entry opened last, and a
    'field = value' line becomes <bibtex:field>value</bibtex:field>
    (title, author and keywords get their dedicated formatting).  Fields
    whose value is empty are dropped.
    """
    filecontents = []
    endentry = ''

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile(r'}\s*$')
    endtag_rex = re.compile(r'^\s*}\s*$')

    # field = {data}: the field name is anything up to '=' or whitespace,
    # not only \w characters
    bracefield_rex = re.compile(r'\s*([^=\s]*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*([^=\s]*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    # latex-specific replacements, applied in this order
    # ('---' must be handled before '--', '\"o' before '\o')
    latex_replacements = (
        ('~', ' '),             # alternative would be '&#160;'
        ("\\'a", '&#225;'),
        ('\\"a', '&#228;'),
        ("\\'c", '&#263;'),
        ('\\"o', '&#246;'),
        ('\\o', '&#248;'),
        ('\\"u', '&#252;'),
        ('---', '&#x2014;'),
        ('--', '-'),
    )

    for line in filecontents_source:
        # drop the line terminator only when there is one, so that a final
        # line without newline does not lose its last character
        if line.endswith('\n'):
            line = line[:-1]

        # encode character entities ('&' first, it occurs in the others)
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start item: publication type (store the closing tags for later)
        if pubtype_rex.match(line):
            arttype = pubtype_rex.sub(r'\g<1>', line).lower()
            # unique ids cannot contain ':'
            artid = pubtype_rex.sub(r'\g<2>', line).replace(':', '-')
            endentry = '</bibtex:' + arttype + '>' + '\n</bibtex:entry>\n'
            line = '<bibtex:entry id="' + artid + '">\n' + \
                   '<bibtex:' + arttype + '>'

        # end entry if just a }
        if endtype_rex.match(line):
            line = endtag_rex.sub(endentry, line)

        field = ''
        data = ''
        # field = {data} entries
        if bracedata_rex.match(line):
            field = bracefield_rex.sub(r'\g<1>', line).lower()
            data = bracedata_rex.sub(r'\g<2>', line)

        # field = "data" entries
        elif quotedata_rex.match(line):
            field = quotefield_rex.sub(r'\g<1>', line).lower()
            data = quotedata_rex.sub(r'\g<2>', line)

        # field = data entries
        elif data_rex.match(line):
            field = field_rex.sub(r'\g<1>', line).lower()
            data = data_rex.sub(r'\g<2>', line)

        if field == 'title':
            line = bibtextitle(data)
        elif field == 'author':
            line = bibtexauthor(data)
        elif field == 'keywords':
            line = bibtexkeyword(data)
        elif field != '':
            data = removebraces(data).strip()
            if data != '':
                line = '<bibtex:' + field + '>' + data + \
                       '</bibtex:' + field + '>'
            else:
                # get rid of the field={} type stuff
                line = ''

        if line != '':
            # do this now, after braces were removed
            for latex, replacement in latex_replacements:
                line = line.replace(latex, replacement)

            filecontents.append(line)

    return filecontents
256
#
# tell whether abbr occurs in line outside of any braces or quotes
# (assumes abbr appears only once on the line out of braces and quotes)
#
def verify_out_of_braces(line, abbr):
    """Return 1 if *abbr* is found, as a whole word and ignoring case, in a
    part of *line* that is neither inside braces nor inside double quotes;
    return 0 otherwise.
    """
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    brace_depth = 0
    inside_quotes = False

    # the split keeps the delimiters as separate tokens
    for token in delimiter_rex.split(line):
        if token == '{':
            brace_depth += 1
            continue
        if token == '}':
            brace_depth -= 1
            continue
        if token == '"':
            inside_quotes = not inside_quotes
            continue
        if brace_depth == 0 and not inside_quotes and word_rex.search(token):
            return 1

    return 0
285
286
#
# resolve BibTeX '#' concatenation on one 'field = value' line:
# phrase1 # phrase2 # ... # phrasen  -->  phrase1 phrase2 ... phrasen
# Bug: Doesn't always work with multiple abbreviations plugged in
#
def concat_line(line):
    """Return *line* with its '#'-separated value parts joined by spaces.

    Only the part after the equals sign is examined.  Inner delimiters
    (quotes or braces at the seams between parts) are dropped; a leading
    quote of the first part becomes '{' and a trailing quote of the last
    part (optionally followed by a comma) becomes '}'.
    """
    field_name = field_rex.sub(r'\g<1>', line)
    value_part = field_rex.sub(r'\g<2>', line)

    fragments = concatsplit_rex.split(value_part)
    last_index = len(fragments) - 1

    pieces = [field_name + ' =']
    for index, fragment in enumerate(fragments):
        fragment = fragment.strip()

        # opening delimiter
        if index == 0:
            if fragment.startswith('"'):
                fragment = '{' + fragment[1:]
        elif fragment[:1] in ('"', '{'):
            fragment = fragment[1:]

        # closing delimiter
        if index != last_index:
            if fragment[-1:] in ('"', '}'):
                fragment = fragment[:-1]
        elif fragment.endswith('"'):
            fragment = fragment[:-1] + '}'
        elif fragment.endswith('",'):
            fragment = fragment[:-2] + '},'

        # if the fragment did have \#, add the # back
        if fragment.endswith('\\'):
            fragment = fragment + '#'

        pieces.append(fragment)

    return ' '.join(pieces)
332
# substitute abbreviations into filecontents
# @param filecontents_source - string of data from file
def bibtex_replace_abbreviations(filecontents_source):
    """Expand @string abbreviations and drop @string/@comment/@preamble.

    @param filecontents_source --> the whole (washed) file as one string
    @return the file as one string, one construct per line, in which
            abbreviations found outside braces and quotes are replaced by
            their value and '#' concatenations on those lines are resolved

    The twelve month abbreviations that bibtex predefines are always known;
    further ones are learnt from the @string definitions in the input.
    Blank lines are removed.
    """
    filecontents = filecontents_source.splitlines()

    #  These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    front = r'\b'
    back = r'(,?)\b'

    # abbr_list, value_list and abbr_rex are parallel lists
    abbr_rex = [re.compile(front + abbr + back, re.I) for abbr in abbr_list]

    abbrdef_rex = re.compile(r'\s*@string\s*{\s*(' +
                             valid_name_chars + r'*)\s*=(.*)', re.I)

    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = 0
    kept_lines = []

    for line in filecontents:
        if line == ' ' or line == '':
            continue

        # after @string/@comment/@preamble: the next line holding a '}' is
        # swallowed; lines without one fall through to the checks below
        if waiting_for_end_string:
            if '}' in line:
                waiting_for_end_string = 0
                continue

        definition = abbrdef_rex.search(line)
        if definition:
            abbr = definition.group(1)

            # the first definition of a name wins
            if abbr not in abbr_list:
                abbr_list.append(abbr)
                value_list.append(definition.group(2).strip())
                abbr_rex.append(re.compile(front + abbr + back, re.I))
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line):
            waiting_for_end_string = 1
            continue

        if preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace subsequent abbreviations with the value
        for abbr, value, rex in zip(abbr_list, value_list, abbr_rex):
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # A function replacement inserts the value literally;
                    # used as a template, backslashes of LaTeX commands in
                    # the value would be read as escapes (\t, \n, \1 ...).
                    line = rex.sub(
                        lambda match, value=value: value + match.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept_lines.append(line + '\n')

    filecontents2 = ''.join(kept_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecontents2 = filecontents2.replace('{"', '{{')
    filecontents2 = filecontents2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecontents2 = afterquotevalue_rex.sub('",\n', filecontents2)
    filecontents2 = afterbrace_rex.sub('"\n}', filecontents2)
    filecontents2 = afterbracevalue_rex.sub('\\g<1>},\n', filecontents2)

    return filecontents2
433
#
# convert @type( ... ) to @type{ ... }
#
def no_outer_parens(filecontents):
    """Return *filecontents* with entry-delimiting parentheses made braces.

    @param filecontents --> the whole file as one string
    @return the same string in which a '(' that directly follows a text
            part containing '@', and the ')' that balances it, are replaced
            by '{' and '}'; all other parentheses are left untouched
    """
    # split on parens and braces, keeping them as separate phrases
    paren_split = re.split(r'([(){}])', filecontents)

    open_paren_count = 0
    open_type = 0    # 1 while inside an entry that was opened with '('
    look_next = 0    # 1 when the previous phrase contained an '@'

    at_rex = re.compile(r'@\w*')

    # collect the phrases and join once instead of growing a string
    rebuilt = []

    for phrase in paren_split:
        if look_next == 1:
            if phrase == '(':
                # entry opened with a paren: convert to a brace
                phrase = '{'
                open_paren_count = open_paren_count + 1
            else:
                open_type = 0
            look_next = 0

        if phrase == '(':
            open_paren_count = open_paren_count + 1

        elif phrase == ')':
            open_paren_count = open_paren_count - 1
            # the paren that balances the entry's opening one
            if open_type == 1 and open_paren_count == 0:
                phrase = '}'
                open_type = 0

        elif at_rex.search(phrase):
            open_type = 1
            look_next = 1

        rebuilt.append(phrase)

    return ''.join(rebuilt)
477   
478
# make all whitespace into just one space
# format the bibtex file into a usable form.
def bibtexwasher(filecontents_source):
    """Normalize raw BibTeX lines into one construct per line.

    @param filecontents_source --> iterable of the lines of the BibTeX file
    @return list of non-blank lines, each ending with a newline, with
            abbreviations already substituted

    Whitespace runs are collapsed, lines starting with % are dropped,
    @type( ... ) is converted to @type{ ... }, and line breaks are inserted
    after entry headers, after field values and before the brace that
    closes an entry.
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace
    # ignore comments
    washed = []
    for line in filecontents_source:
        line = space_rex.sub(' ', line.strip())
        if not comment_rex.match(line):
            washed.append(' ' + line)

    # the file is in one long string
    filecontents = ''.join(washed)

    filecontents = no_outer_parens(filecontents)

    #
    # split lines according to preferred syntax scheme
    #
    filecontents = re.sub(r'(=\s*{[^=]*)},', '\\g<1>},\n', filecontents)

    # add new lines after commas that are after values
    filecontents = re.sub(r'"\s*,', '",\n', filecontents)
    filecontents = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecontents)
    filecontents = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                          '\n\n\\g<1>\\g<2>,\n', filecontents)

    # add new lines after }
    filecontents = re.sub(r'"\s*}', '"\n}\n', filecontents)
    filecontents = re.sub(r'}\s*,', '},\n', filecontents)

    filecontents = re.sub(r'@(\w*)', '\n@\\g<1>', filecontents)

    # character encoding, reserved latex characters: {\&} and \& become &
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)

    # do checking for open braces to get format correct:
    # a brace that closes the outermost level goes onto its own line
    open_brace_count = 0
    pieces = []
    for phrase in re.split(r'([{}])', filecontents):
        if phrase == '{':
            open_brace_count = open_brace_count + 1
        elif phrase == '}':
            open_brace_count = open_brace_count - 1
            if open_brace_count == 0:
                pieces.append('\n')
        pieces.append(phrase)
    filecontents = ''.join(pieces)

    filecontents2 = bibtex_replace_abbreviations(filecontents)

    # gather the non-blank lines, each terminated by a newline
    return [line + '\n' for line in filecontents2.splitlines()
            if line != '' and line != ' ']
563
564
def contentshandler(filecontents_source):
    """Convert the BibTeX lines and print the BibTeXML document to stdout.

    @param filecontents_source --> iterable of the lines of the BibTeX file

    The lines are washed, decoded, and printed between the XML prolog and
    the closing </bibtex:file> tag.
    """
    washeddata = bibtexwasher(filecontents_source)
    outdata = bibtexdecoder(washeddata)

    # print with a single parenthesized argument produces the same output
    # whether print is a statement (Python 2) or a function (Python 3)
    print('<?xml version="1.0" encoding="utf-8"?>')
    #print('<?xml-stylesheet href="bibtexml.css" type="text/css" ?>')
    print('<!DOCTYPE bibtex:file PUBLIC')
    print('    "-//BibTeXML//DTD XML for BibTeX v1.0//EN"')
    print('    "bibtexml.dtd" >')
    print('<bibtex:file xmlns:bibtex="http://bibtexml.sf.net/">')
    print('')
    for line in outdata:
        print(line)
    print('  <!-- manual cleanup may be required... -->')
    print('</bibtex:file>')
579
580
def filehandler(filepath):
    """Return the lines of the file at *filepath*.

    @param filepath --> path of the BibTeX file to read
    @return list of lines including their line terminators; an empty list
            when the file cannot be opened

    A file that cannot be opened is reported on stderr, because stdout is
    where the XML document is written.
    """
    import sys

    try:
        fd = open(filepath, 'r')
    except IOError:
        sys.stderr.write('Could not open file: ' + filepath + '\n')
        return []

    # close the file even when reading fails
    try:
        return fd.readlines()
    finally:
        fd.close()
589
590
# main program

def main():
    """Convert the file named by the first command-line argument, or
    standard input when no argument is given, and print the result.
    """
    import sys

    cli_args = sys.argv[1:]
    if not cli_args:
        # no file given: read the BibTeX data from stdin
        source_lines = sys.stdin.readlines()
    else:
        source_lines = filehandler(cli_args[0])
    contentshandler(source_lines)


if __name__ == "__main__":
    main()
604
605
606# end python script
Note: See TracBrowser for help on using the repository browser.