Context Navigation

source: trunk/external/bibtex2xml.py @ 140

Last change on this file since 140 was 67, checked in by pinsard, 16 years ago
introducing bibtex ingestion. still some trouble because of firstname+surname decomposition of personname
Property svn:executable set to *
File size: 18.1 KB

Line
1	#!/usr/bin/env python
2	# -*- coding: utf-8 -*-
3	# Time-stamp: "2006-07-26T09:50:29 vidar"
4	"""
5	Decoder for bibliographic data, BibTeX
6	Usage: python bibtex2xml.py bibfile.bib > bibfile.xml
7
8	(c) Vidar Bronken Gundersen, Sara Sprenkle
9	http://bibtexml.sourceforge.net/
10	Reuse approved as long as this notification is kept.
11	License: http://creativecommons.org/licenses/GPL/2.0/
12
13	Contributions/thanks to:
14	Thomas Karl Schwaerzler, read stdin
15	Egon Willighagen, http://jreferences.sf.net/
16	Richard Mahoney, for providing a test case
17
18	This is Sara Sprenkle's rewrite of our original script, which
19	is changed to be more robust and handle more bibtex features:
20	3. Allow spaces between @type and first {
21	4. 'author' fields with multiple authors split by ' and '
22	are put in separate xml 'bibtex:person' tags.
23	5. Option for Titles: words are capitalized
24	only if first letter in title or capitalized inside braces
25	6. Removes braces from within field values
26	7. Ignores comments in bibtex file (including @comment{ or % )
27	8. Replaces some special latex tags, e.g., replaces ~ with ' '
28	9. Handles bibtex @string abbreviations
29	--> includes bibtex's default abbreviations for months
30	--> does concatenation of abbr # ' more ' and ' more ' # abbr
31	10. Handles @type( ... ) or @type{ ... }
32	11. The keywords field is split on , or ; and put into
33	separate xml 'bibtex:keywords' tags
34	12. Ignores @preamble
35
36	replace ':' with '-' for bibtex:entry@id: unique-ids cannot contain ':'
37
38	Known Limitations
39	1. Does not transform Latex encoding like math mode
40	and special latex symbols.
41	2. Does not parse author fields into first and last names.
42	E.g., It does not do anything special to an author whose name is
43	in the form LAST_NAME, FIRST_NAME. In the 'author' tag, it will show up as
44	<bibtex:author>LAST_NAME, FIRST_NAME</bibtex:author>
45	3. Does not handle 'crossref' fields other than to print
46	<bibtex:crossref>...</bibtex:crossref>
47	4. Does not inform user of the input's format errors.
48	You just won't be able to transform the file later with XSL
49	Create error.log file?
50
51	5. Special treatment of
52	howpublished = '\url{http://www.cs.duke.edu/ari/crisp/}',
53
54	6. document functions with docstrings
55
56	You will have to manually edit the XML output if you need to handle
57	these (and unknown) limitations.
58
59	"""
60
61	import string, re
62
# Regex character class for the characters allowed in an @string
# abbreviation name; callers append their own quantifier.
valid_name_chars = r'[\w\-:]'

# define global regular expression variables

# the word "and" surrounded by whitespace separates authors
author_rex = re.compile(r'\s+and\s+')
# any single brace
rembraces_rex = re.compile(r'[{}]')
# a word wrapped in braces; the group keeps it in the split() result
capitalize_rex = re.compile(r'(\{\w*\})')

# used by bibtexkeyword(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line): the '#' concatenation operator together with
# whatever whitespace surrounds it
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "<field> = <rest of line>": group 1 is the whole field name, group 2 is
# everything after the equals sign
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
# "<field> = <bare value>,": group 2 stops at the first comma
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')
82
83
def removebraces(str):
    """Return the string parameter with every '{' and '}' removed."""
    without_braces = rembraces_rex.sub('', str)
    return without_braces
89
def bibtexauthor(data):
    """Return the <bibtex:author> element for an author field value.

    The value is split on the word "and".  With more than one name, each
    name is stripped, has its braces removed and is wrapped in its own
    <bibtex:person> element on a separate line.  A single name is placed
    directly inside <bibtex:author> with only its braces removed.

    @param data  author field value
    @return the element text, stripped of surrounding whitespace
    """
    names = author_rex.split(data)

    if len(names) > 1:
        people = [
            '<bibtex:person>' + removebraces(name.strip())
            + '</bibtex:person>' + '\n'
            for name in names
        ]
        element = '<bibtex:author>' + '\n' + ''.join(people)
    else:
        element = '<bibtex:author>' + removebraces(names[0])

    element = element + '</bibtex:author>'
    return element.strip()
104
105
def bibtextitle(data):
    """Return the <bibtex:title> element for a title string.

    @param data  title string
    @return the title, with braces removed and surrounding whitespace
            stripped, wrapped in <bibtex:title> tags
    """
    cleaned = removebraces(data).strip()
    return '<bibtex:title>' + cleaned + '</bibtex:title>'
115
116
def bibtexkeyword(data):
    """Return one <bibtex:keywords> element per keyword, one per line.

    Keywords are assumed to be delimited by , or ; and each one is
    stripped and has its braces removed.

    @param data  keywords field value
    @return the elements joined by newlines, stripped of surrounding
            whitespace
    """
    elements = []
    for raw_keyword in keywords_rex.split(data):
        cleaned = removebraces(raw_keyword.strip())
        elements.append('<bibtex:keywords>' + cleaned + '</bibtex:keywords>')
    return '\n'.join(elements).strip()
127
128
129
def capitalizetitle(data):
    """Return the title with BibTeX-style capitalisation applied.

    The title is split around brace-wrapped words.  A piece that starts
    with '{' (after leading whitespace) keeps its capitalisation and only
    loses its braces.  Of the remaining pieces, the first piece of the
    title is left-stripped and capitalised with str.capitalize(); every
    later piece is lower-cased.

    @param data  title string
    @return the re-capitalised title
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str method instead of string.lstrip(): the module-level function
        # only exists on Python 2, and the rest of the file already uses
        # str methods such as strip().
        trimmed = phrase.lstrip()

        if trimmed.startswith('{'):
            # keep the phrase's capitalisation the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first piece --> capitalise first letter (after spaces)
            pieces.append(trimmed.capitalize())
        else:
            pieces.append(phrase.lower())

    return ''.join(pieces)
152
153
def bibtexdecoder(filecontents_source):
    """Translate washed BibTeX lines into BibTeXML lines.

    Each input line is handled on its own, in this order:

    1. a trailing newline is dropped and &, < and > are escaped as XML
       entities;
    2. an entry header ``@type{key,`` becomes the opening
       <bibtex:entry id="key"> and <bibtex:type> tags (':' in the key is
       replaced by '-');
    3. a line starting with '}' that holds only that brace becomes the
       closing tags of the most recent entry;
    4. a ``field = {data}``, ``field = "data"`` or ``field = data`` line
       becomes a <bibtex:field> element, with title, author and keywords
       delegated to bibtextitle, bibtexauthor and bibtexkeyword; fields
       whose data is empty are dropped;
    5. a few LaTeX sequences are replaced.

    @param filecontents_source  iterable of lines (contentshandler passes
                                the result of bibtexwasher)
    @return list of output lines; lines that end up empty are omitted
    """
    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*\{\s*(.*),')
    endtype_rex = re.compile(r'\}\s*$')
    endtag_rex = re.compile(r'^\s*\}\s*$')

    # field = {data}; the field name may hold anything except '=' and
    # whitespace
    bracefield_rex = re.compile(r'\s*([^=\s]*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*([^=\s]*)\s*=\s*\{(.*)\},?')

    # field = "data"
    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    # (LaTeX sequence, replacement), applied in this order: '\"o' must be
    # tried before '\o' and '---' before '--'.  Accented characters are
    # written as numeric character references so the output stays ASCII
    # and does not depend on the encoding of the output stream.
    latex_replacements = (
        ('~', ' '),
        ("\\'a", '&#225;'),
        ('\\"a', '&#228;'),
        ("\\'c", '&#263;'),
        ('\\"o', '&#246;'),
        ('\\o', '&#248;'),
        ('\\"u', '&#252;'),
        ('---', '&#8212;'),
        ('--', '-'),
    )

    filecontents = []
    endentry = ''

    for line in filecontents_source:
        # drop the line terminator, but never a real character
        if line.endswith('\n'):
            line = line[:-1]

        # encode character entities ('&' first, so the entities produced
        # for '<' and '>' are not escaped a second time)
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start item: publication type (store for later use)
        if pubtype_rex.match(line):
            arttype = pubtype_rex.sub('\\g<1>', line).lower()
            # unique ids cannot contain ':'
            artid = pubtype_rex.sub('\\g<2>', line).replace(':', '-')
            endentry = '</bibtex:' + arttype + '>' + '\n</bibtex:entry>\n'
            line = '<bibtex:entry id="' + artid + '">\n' + \
                   '<bibtex:' + arttype + '>'

        # end entry if just a }
        if endtype_rex.match(line):
            line = endtag_rex.sub(endentry, line)

        field = ''
        data = ''
        # field = {data} entries
        if bracedata_rex.match(line):
            field = bracefield_rex.sub('\\g<1>', line).lower()
            data = bracedata_rex.sub('\\g<2>', line)
        # field = "data" entries
        elif quotedata_rex.match(line):
            field = quotefield_rex.sub('\\g<1>', line).lower()
            data = quotedata_rex.sub('\\g<2>', line)
        # field = data entries
        elif data_rex.match(line):
            field = field_rex.sub('\\g<1>', line).lower()
            data = data_rex.sub('\\g<2>', line)

        if field == 'title':
            line = bibtextitle(data)
        elif field == 'author':
            line = bibtexauthor(data)
        elif field == 'keywords':
            line = bibtexkeyword(data)
        elif field != '':
            data = removebraces(data).strip()
            if data != '':
                line = '<bibtex:' + field + '>' + data + \
                       '</bibtex:' + field + '>'
            else:
                # get rid of the field={} type stuff
                line = ''

        if line != '':
            # latex-specific replacements
            # do this now after braces were removed
            for latex, replacement in latex_replacements:
                line = line.replace(latex, replacement)

            filecontents.append(line)

    return filecontents
256
def verify_out_of_braces(line, abbr):
    """Return 1 iff abbr is in line but not inside braces or quotes.

    The line is split on {, } and "; brace depth and quote state are
    tracked while walking the pieces, and abbr is searched for as a whole
    word, ignoring case.  Assumes that abbr appears only once on the line
    (out of braces and quotes).

    @param line  line to inspect
    @param abbr  abbreviation name; inserted into a regex as-is
    @return 1 when abbr is found at brace depth 0 outside quotes, else 0
    """
    pieces = delimiter_rex.split(line)
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    brace_depth = 0
    inside_quotes = False

    for piece in pieces:
        if piece == '{':
            brace_depth += 1
        elif piece == '}':
            brace_depth -= 1
        elif piece == '"':
            # every quote character toggles the quoted state
            inside_quotes = not inside_quotes
        elif brace_depth == 0 and not inside_quotes \
                and word_rex.search(piece):
            return 1

    return 0
285
286
def concat_line(line):
    """Join the '#'-concatenated parts of a field line into one value.

    A line in the form  field = phrase1 # phrase2 # ... # phrasen  is
    returned as  field = phrase1 phrase2 ... phrasen  with the delimiters
    between the phrases removed: the first phrase's opening quote becomes
    '{', the last phrase's closing quote becomes '}', and the quotes or
    braces at the inner joins are dropped.

    Bug: Doesn't always work with multiple abbreviations plugged in.

    @param line  a field line containing '#' concatenations
    @return the rebuilt line
    """
    # only look at part after equals
    field = field_rex.sub('\\g<1>', line)
    rest = field_rex.sub('\\g<2>', line)

    pieces = concatsplit_rex.split(rest)
    last = len(pieces) - 1
    joined = field + ' ='

    for position, piece in enumerate(pieces):
        piece = piece.strip()

        # left edge: inner pieces lose their opening delimiter, the first
        # piece has an opening quote turned into a brace
        if position != 0:
            if piece.startswith('"') or piece.startswith('{'):
                piece = piece[1:]
        elif piece.startswith('"'):
            piece = '{' + piece[1:]

        # right edge: inner pieces lose their closing delimiter, the last
        # piece has a closing quote turned into a brace
        if position != last:
            if piece.endswith('"') or piece.endswith('}'):
                piece = piece[:-1]
        elif piece.endswith('"'):
            piece = piece[:-1] + '}'
        elif piece.endswith('",'):
            piece = piece[:-2] + '},'

        # if phrase did have \#, add the \# back
        if piece.endswith('\\'):
            piece = piece + '#'

        joined = joined + ' ' + piece

    return joined
332
def bibtex_replace_abbreviations(filecontents_source):
    """Substitute @string abbreviations into the file contents.

    The text is processed line by line:

    * blank lines are dropped;
    * an ``@string{name = value`` line registers a new abbreviation
      (unless the name is already known) and is dropped;
    * ``@comment{`` and ``@preamble{`` lines are dropped;
    * after any of those three, following lines are dropped up to and
      including the next line that contains '}';
    * on every other line, each known abbreviation that occurs outside
      braces and quotes (see verify_out_of_braces) is replaced by its
      value, and '#' concatenations are resolved with concat_line.

    The month names jan..dec are predefined, as they are in BibTeX.

    @param filecontents_source  string of data from file
    @return the processed text as one string, each kept line terminated
            by a newline
    """
    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    front = '\\b'
    back = '(,?)\\b'

    def compile_abbr(name):
        # whole-word, case-insensitive match; group 1 keeps an optional
        # comma that directly follows the abbreviation
        return re.compile(front + name + back, re.I)

    abbr_rex = [compile_abbr(name) for name in abbr_list]

    # @string{ <name> = <value>; the name is one or more valid name
    # characters (valid_name_chars only describes a single character)
    abbrdef_rex = re.compile(r'\s*@string\s*\{\s*(' +
                             valid_name_chars + r'*)\s*=(.*)', re.I)

    comment_rex = re.compile(r'@comment\s*\{', re.I)
    preamble_rex = re.compile(r'@preamble\s*\{', re.I)

    waiting_for_end_string = 0
    kept_lines = []

    for line in filecontents_source.splitlines():
        if line == ' ' or line == '':
            continue

        if waiting_for_end_string:
            # NOTE(review): the indentation of this block is not
            # recoverable from the source at hand.  Every line is skipped
            # while waiting, which is what ignoring @comment/@preamble
            # bodies requires -- confirm against the repository copy.
            if '}' in line:
                waiting_for_end_string = 0
            continue

        definition = abbrdef_rex.search(line)
        if definition:
            abbr = definition.group(1)
            if abbr not in abbr_list:
                abbr_list.append(abbr)
                value_list.append(definition.group(2).strip())
                abbr_rex.append(compile_abbr(abbr))
            waiting_for_end_string = 1
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = 1
            continue

        # replace subsequent abbreviations with the value
        for name, name_rex, value in zip(abbr_list, abbr_rex, value_list):
            if name_rex.search(line):
                if verify_out_of_braces(line, name) == 1:
                    # A function is used as the replacement so that
                    # backslashes in the value (LaTeX commands) are
                    # inserted literally instead of being read as regex
                    # template escapes.
                    line = name_rex.sub(
                        lambda match, value=value: value + match.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        kept_lines.append(line + '\n')

    filecontents2 = ''.join(kept_lines)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecontents2 = filecontents2.replace('{"', '{{')
    filecontents2 = filecontents2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*\}')
    afterbracevalue_rex = re.compile(r'(=\s*\{[^=]*)\},\s*')

    # add new lines to data that changed because of abbreviation
    # substitutions
    filecontents2 = afterquotevalue_rex.sub('",\n', filecontents2)
    filecontents2 = afterbrace_rex.sub('"\n}', filecontents2)
    filecontents2 = afterbracevalue_rex.sub('\\g<1>},\n', filecontents2)

    return filecontents2
433
def no_outer_parens(filecontents):
    """Convert @type( ... ) to @type{ ... }.

    The text is split on parentheses and braces.  A '(' that is the first
    delimiter after a piece of text containing '@' is turned into '{', and
    the ')' that brings the parenthesis count back to zero is turned into
    '}'.  All other text is copied through unchanged.

    @param filecontents  the bibtex data as one string
    @return the rebuilt string
    """
    tokens = re.split(r'([(){}])', filecontents)
    at_rex = re.compile(r'@\w*')

    paren_depth = 0
    in_paren_entry = False   # an '@...' piece was seen and may be open
    expect_opener = False    # this token is the one right after '@...'
    rebuilt = []

    for token in tokens:
        if expect_opener:
            expect_opener = False
            if token == '(':
                token = '{'
                paren_depth += 1
            else:
                in_paren_entry = False

        if token == '(':
            paren_depth += 1
        elif token == ')':
            paren_depth -= 1
            if in_paren_entry and paren_depth == 0:
                token = '}'
                in_paren_entry = False
        elif at_rex.search(token):
            in_paren_entry = True
            expect_opener = True

        rebuilt.append(token)

    return ''.join(rebuilt)
477
478
def bibtexwasher(filecontents_source):
    """Format the bibtex file into a usable form.

    All whitespace is collapsed to single spaces, lines starting with '%'
    are dropped, @type( ... ) is converted to @type{ ... }, and the text
    is re-split into lines after field values, after entry keys and
    before closing braces.  @string abbreviations are then substituted
    (bibtex_replace_abbreviations) and blank lines are removed.

    @param filecontents_source  iterable of raw lines of the bibtex file
    @return list of non-blank lines, each terminated by a newline
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace, ignore comments
    kept = []
    for line in filecontents_source:
        line = space_rex.sub(' ', line.strip())
        if not comment_rex.match(line):
            kept.append(' ' + line)

    # the file is in one long string
    filecontents = ''.join(kept)

    filecontents = no_outer_parens(filecontents)

    #
    # split lines according to preferred syntax scheme
    #
    filecontents = re.sub(r'(=\s*\{[^=]*)\},', '\\g<1>},\n', filecontents)

    # add new lines after commas that are after values
    filecontents = re.sub(r'"\s*,', '",\n', filecontents)
    filecontents = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecontents)
    filecontents = re.sub(r'(@\w*)\s*(\{(\s*)[^,\s]*)\s*,',
                          '\n\n\\g<1>\\g<2>,\n', filecontents)

    # add new lines after }
    filecontents = re.sub(r'"\s*\}', '"\n}\n', filecontents)
    filecontents = re.sub(r'\}\s*,', '},\n', filecontents)

    # start every @... on a new line
    filecontents = re.sub(r'@(\w*)', '\n@\\g<1>', filecontents)

    # character encoding, reserved latex characters: {\&} and \& become &
    filecontents = filecontents.replace('{\\&}', '&')
    filecontents = filecontents.replace('\\&', '&')

    # do checking for open braces to get format correct: the brace that
    # closes an entry is moved onto a line of its own
    open_brace_count = 0
    rebuilt = []
    for phrase in re.split(r'([{}])', filecontents):
        if phrase == '{':
            open_brace_count += 1
        elif phrase == '}':
            open_brace_count -= 1
            if open_brace_count == 0:
                rebuilt.append('\n')
        rebuilt.append(phrase)
    filecontents = ''.join(rebuilt)

    filecontents2 = bibtex_replace_abbreviations(filecontents)

    # gather, ignoring blank lines
    return [line + '\n'
            for line in filecontents2.splitlines()
            if line != '' and line != ' ']
563
564
def contentshandler(filecontents_source):
    """Convert bibtex lines to BibTeXML and print the document to stdout.

    The lines are run through bibtexwasher and bibtexdecoder, then printed
    between the XML declaration, the DOCTYPE and the <bibtex:file> root
    element.

    @param filecontents_source  iterable of raw lines of the bibtex file
    """
    washeddata = bibtexwasher(filecontents_source)
    outdata = bibtexdecoder(washeddata)

    # print is called with exactly one parenthesised argument everywhere:
    # that prints the same text as the old print statement on Python 2 and
    # is also valid on Python 3.
    print('<?xml version="1.0" encoding="utf-8"?>')
    print('<!DOCTYPE bibtex:file PUBLIC')
    print(' "-//BibTeXML//DTD XML for BibTeX v1.0//EN"')
    print(' "bibtexml.dtd" >')
    print('<bibtex:file xmlns:bibtex="http://bibtexml.sf.net/">')
    print('')
    for line in outdata:
        print(line)
    print(' <!-- manual cleanup may be required... -->')
    print('</bibtex:file>')
579
580
def filehandler(filepath):
    """Read a file and return its lines.

    @param filepath  path of the file to read
    @return the list produced by readlines(); an empty list when the file
            cannot be opened or read, after a message has been written to
            stderr
    """
    import sys

    try:
        # 'with' closes the file even when readlines() fails
        with open(filepath, 'r') as fd:
            return fd.readlines()
    except (IOError, OSError):
        # stderr, not stdout: stdout carries the generated XML document
        sys.stderr.write('Could not open file: %s\n' % (filepath,))
        return []
589
590
# main program

def main():
    """Convert the file named by the first argument, or stdin if none.

    The lines are read with filehandler when a command-line argument is
    present and from sys.stdin otherwise, then passed to contentshandler.
    """
    import sys

    arguments = sys.argv[1:]
    if arguments:
        source_lines = filehandler(arguments[0])
    else:
        # instead of exit() read stdin here
        source_lines = sys.stdin.readlines()

    contentshandler(source_lines)


if __name__ == "__main__":
    main()
604
605
606	# end python script

Note: See TracBrowser for help on using the repository browser.

Download in other formats: