1 | #!/usr/bin/env python |
---|
2 | # -*- coding: utf-8 -*- |
---|
3 | # Time-stamp: "2006-07-26T09:50:29 vidar" |
---|
4 | """ |
---|
5 | Decoder for bibliographic data, BibTeX |
---|
6 | Usage: python bibtex2xml.py bibfile.bib > bibfile.xml |
---|
7 | |
---|
8 | (c) Vidar Bronken Gundersen, Sara Sprenkle |
---|
9 | http://bibtexml.sourceforge.net/ |
---|
10 | Reuse approved as long as this notification is kept. |
---|
11 | License: http://creativecommons.org/licenses/GPL/2.0/ |
---|
12 | |
---|
13 | Contributions/thanks to: |
---|
14 | Thomas Karl Schwaerzler, read stdin |
---|
15 | Egon Willighagen, http://jreferences.sf.net/ |
---|
16 | Richard Mahoney, for providing a test case |
---|
17 | |
---|
18 | This is Sara Sprenkle's rewrite of our original script, which |
---|
19 | is changed to be more robust and handle more bibtex features: |
---|
20 | 3. Allow spaces between @type and first { |
---|
21 | 4. 'author' fields with multiple authors split by ' and ' |
---|
22 | are put in separate xml 'bibtex:person' tags. |
---|
23 | 5. Option for Titles: words are capitalized |
---|
24 | only if first letter in title or capitalized inside braces |
---|
25 | 6. Removes braces from within field values |
---|
26 | 7. Ignores comments in bibtex file (including @comment{ or % ) |
---|
27 | 8. Replaces some special latex tags, e.g., replaces ~ with ' ' |
---|
28 | 9. Handles bibtex @string abbreviations |
---|
29 | --> includes bibtex's default abbreviations for months |
---|
30 | --> does concatenation of abbr # ' more ' and ' more ' # abbr |
---|
31 | 10. Handles @type( ... ) or @type{ ... } |
---|
32 | 11. The keywords field is split on , or ; and put into |
---|
33 | separate xml 'bibtex:keywords' tags |
---|
34 | 12. Ignores @preamble |
---|
35 | |
---|
36 | replace ':' with '-' for bibtex:entry@id: unique-ids cannot contain ':' |
---|
37 | |
---|
38 | Known Limitations |
---|
39 | 1. Does not transform Latex encoding like math mode |
---|
40 | and special latex symbols. |
---|
41 | 2. Does not parse author fields into first and last names. |
---|
42 | E.g., It does not do anything special to an author whose name is |
---|
43 | in the form LAST_NAME, FIRST_NAME. In the 'author' tag, it will show up as |
---|
44 | <bibtex:author>LAST_NAME, FIRST_NAME</bibtex:author> |
---|
45 | 3. Does not handle 'crossref' fields other than to print |
---|
46 | <bibtex:crossref>...</bibtex:crossref> |
---|
47 | 4. Does not inform user of the input's format errors. |
---|
48 | You just won't be able to transform the file later with XSL |
---|
49 | Create error.log file? |
---|
50 | |
---|
51 | 5. Special treatment of |
---|
52 | howpublished = '\url{http://www.cs.duke.edu/ari/crisp/}', |
---|
53 | |
---|
54 | 6. document functions with docstrings |
---|
55 | |
---|
56 | You will have to manually edit the XML output if you need to handle |
---|
57 | these (and unknown) limitations. |
---|
58 | |
---|
59 | """ |
---|
60 | |
---|
61 | import string, re |
---|
62 | |
---|
# Regex character class for the characters allowed in an @string
# abbreviation name.
valid_name_chars = r'[\w\-:]'

# Module-level regular expressions, compiled once.  They are written as raw
# strings so that sequences such as \s and \w reach the regex engine without
# relying on Python passing unknown string escapes through unchanged.

# separator between several authors: "A and B"
author_rex = re.compile(r'\s+and\s+')
# a single opening or closing brace
rembraces_rex = re.compile(r'[{}]')
# a braced run of word characters, e.g. {TeX}; the group makes re.split
# keep the braced part in its result
capitalize_rex = re.compile(r'({\w*})')

# used by bibtexkeyword(data)
keywords_rex = re.compile(r'[,;]')

# used by concat_line(line)
concatsplit_rex = re.compile(r'\s*#\s*')

# split on {, }, or " in verify_out_of_braces
delimiter_rex = re.compile(r'([{}"])', re.I)

# "name = value" lines: field_rex captures the rest of the line as the
# value, data_rex stops the value at the first comma
field_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
data_rex = re.compile(r'\s*(\w*)\s*=\s*([^,]*),?')
82 | |
---|
83 | |
---|
def removebraces(str):
    """Return the string parameter with every '{' and '}' removed."""
    without_open = str.replace('{', '')
    return without_open.replace('}', '')
89 | |
---|
def bibtexauthor(data):
    """Return the <bibtex:author> element for an author field value.

    The value is split on " and ".  With several authors, each one is
    stripped, has its braces removed and is wrapped in its own
    <bibtex:person> element on a separate line.  A single author is placed
    directly inside <bibtex:author> with only the braces removed.
    """
    names = author_rex.split(data)
    if len(names) > 1:
        people = ['<bibtex:person>' + removebraces(name.strip()) +
                  '</bibtex:person>' + '\n'
                  for name in names]
        inner = '\n' + ''.join(people)
    else:
        inner = removebraces(names[0])
    element = '<bibtex:author>' + inner + '</bibtex:author>'
    return element.strip()
104 | |
---|
105 | |
---|
def bibtextitle(data):
    """Return the <bibtex:title> element for the title string *data*.

    Braces are removed from the title and surrounding whitespace is
    stripped before it is wrapped.
    """
    cleaned = removebraces(data).strip()
    return '<bibtex:title>' + cleaned + '</bibtex:title>'
115 | |
---|
116 | |
---|
def bibtexkeyword(data):
    """Return one <bibtex:keywords> element per keyword in *data*.

    Keywords are assumed to be delimited by , or ; and each one is stripped
    and has its braces removed.  The elements are separated by newlines.
    """
    elements = []
    for raw_keyword in keywords_rex.split(data):
        elements.append('<bibtex:keywords>' +
                        removebraces(raw_keyword.strip()) +
                        '</bibtex:keywords>' + '\n')
    # the trailing newline of the last element is removed by strip()
    return ''.join(elements).strip()
127 | |
---|
128 | |
---|
129 | |
---|
def capitalizetitle(data):
    """Return the title *data* with BibTeX-style capitalization applied.

    The title is split around braced words (capitalize_rex).  A piece that
    starts with '{' (after leading whitespace) keeps its capitalization and
    only loses its braces.  Of the remaining pieces, the very first piece of
    the title is left-stripped and capitalized; all others are lowercased.
    """
    pieces = []
    for index, phrase in enumerate(capitalize_rex.split(data)):
        # str.lstrip() replaces string.lstrip(), which only exists in
        # Python 2's string module
        stripped = phrase.lstrip()

        if stripped.startswith('{'):
            # keep phrase's capitalization the same
            pieces.append(removebraces(phrase))
        elif index == 0:
            # first word --> capitalize first letter (after spaces)
            pieces.append(stripped.capitalize())
        else:
            pieces.append(phrase.lower())

    return ''.join(pieces)
152 | |
---|
153 | |
---|
def bibtexdecoder(filecontents_source):
    """Translate washed BibTeX lines into BibTeXML lines.

    filecontents_source -- iterable of lines, each ending in one terminator
        character (it is cut off unconditionally), holding an entry header
        ("@type{id,"), a "field = value" pair or a closing brace.

    Returns a list of strings without trailing newlines.  Lines that end up
    empty (for example "field = {}") are left out.
    """
    filecontents = []
    endentry = ''

    # want @<alphanumeric chars><spaces>{<spaces><any chars>,
    pubtype_rex = re.compile(r'@(\w*)\s*{\s*(.*),')
    endtype_rex = re.compile(r'}\s*$')
    endtag_rex = re.compile(r'^\s*}\s*$')

    # field names of braced values may contain any character except '='
    # and whitespace
    bracefield_rex = re.compile(r'\s*([^=\s]*)\s*=\s*(.*)')
    bracedata_rex = re.compile(r'\s*([^=\s]*)\s*=\s*{(.*)},?')

    quotefield_rex = re.compile(r'\s*(\w*)\s*=\s*(.*)')
    quotedata_rex = re.compile(r'\s*(\w*)\s*=\s*"(.*)",?')

    # (pattern that recognises the value, pattern that yields the name),
    # tried in this order: field = {data}, field = "data", field = data
    value_patterns = (
        (bracedata_rex, bracefield_rex),
        (quotedata_rex, quotefield_rex),
        (data_rex, field_rex),
    )

    # latex-specific replacements, applied in this order ('\"o' has to be
    # handled before '\o', '---' before '--')
    latex_replacements = (
        ('~', ' '),
        ("\\'a", 'á'),
        ('\\"a', 'ä'),
        ("\\'c", 'ć'),
        ('\\"o', 'ö'),
        ('\\o', 'ø'),
        ('\\"u', 'ü'),
        ('---', '—'),
        ('--', '-'),
    )

    for line in filecontents_source:
        # drop the line terminator
        line = line[:-1]

        # encode character entities; '&' goes first so the ampersands of
        # the other two entities are not escaped again
        line = line.replace('&', '&amp;')
        line = line.replace('<', '&lt;')
        line = line.replace('>', '&gt;')

        # start item: publication type (store for later use)
        if pubtype_rex.match(line):
            arttype = pubtype_rex.sub(r'\g<1>', line).lower()
            # unique ids cannot contain ':'
            artid = pubtype_rex.sub(r'\g<2>', line).replace(':', '-')
            endentry = '</bibtex:' + arttype + '>' + '\n</bibtex:entry>\n'
            line = '<bibtex:entry id="' + artid + '">\n' + \
                   '<bibtex:' + arttype + '>'

        # end entry if just a }
        if endtype_rex.match(line):
            line = endtag_rex.sub(endentry, line)

        # field, publication info
        field = ''
        data = ''
        for data_pattern, field_pattern in value_patterns:
            if data_pattern.match(line):
                field = field_pattern.sub(r'\g<1>', line).lower()
                data = data_pattern.sub(r'\g<2>', line)
                break

        if field == 'title':
            line = bibtextitle(data)
        elif field == 'author':
            line = bibtexauthor(data)
        elif field == 'keywords':
            line = bibtexkeyword(data)
        elif field != '':
            data = removebraces(data).strip()
            if data != '':
                line = '<bibtex:' + field + '>' + data + \
                       '</bibtex:' + field + '>'
            else:
                # get rid of the field={} type stuff
                line = ''

        if line != '':
            # do this now after braces were removed
            for latex, replacement in latex_replacements:
                line = line.replace(latex, replacement)

            filecontents.append(line)

    return filecontents
256 | |
---|
def verify_out_of_braces(line, abbr):
    """Return 1 iff abbr is in line but not inside braces or quotes.

    The line is cut at every {, } and "; the pieces in between are searched
    for abbr as a whole word, ignoring case.  Returns 0 otherwise.
    """
    word_rex = re.compile('\\b' + abbr + '\\b', re.I)

    brace_depth = 0
    inside_quotes = False

    for token in delimiter_rex.split(line):
        if token == "{":
            brace_depth += 1
        elif token == "}":
            brace_depth -= 1
        elif token == '"':
            # a quote character toggles the quoted state
            inside_quotes = not inside_quotes
        elif brace_depth == 0 and not inside_quotes and word_rex.search(token):
            return 1

    return 0
285 | |
---|
286 | |
---|
def concat_line(line):
    """Merge a "field = phrase1 # phrase2 # ... # phrasen" line.

    Only the part after the equals sign is split on '#'.  The phrases are
    joined with single spaces; delimiters between phrases are dropped and
    the outermost quotes are turned into braces.
    Bug: Doesn't always work with multiple abbreviations plugged in
    """
    name = field_rex.sub('\\g<1>', line)
    value = field_rex.sub('\\g<2>', line)

    parts = concatsplit_rex.split(value)
    last_index = len(parts) - 1

    joined = name + ' ='

    for index, part in enumerate(parts):
        part = part.strip()

        # opening delimiter: dropped on every phrase but the first, where a
        # quote becomes a brace
        if index != 0:
            if part.startswith(('"', '{')):
                part = part[1:]
        elif part.startswith('"'):
            part = '{' + part[1:]

        # closing delimiter: dropped on every phrase but the last, where a
        # quote (optionally followed by a comma) becomes a brace
        if index != last_index:
            if part.endswith(('"', '}')):
                part = part[:-1]
        elif part.endswith('"'):
            part = part[:-1] + "}"
        elif part.endswith('",'):
            part = part[:-2] + "},"

        # if phrase did have \#, add the \# back
        if part.endswith('\\'):
            part = part + "#"

        joined = joined + ' ' + part

    return joined
332 | |
---|
def bibtex_replace_abbreviations(filecontents_source):
    """Substitute @string abbreviations into the file contents.

    filecontents_source -- string of data from file, one construct per line.

    The twelve month abbreviations are predefined.  Each "@string{name =
    value" line defines a further abbreviation; a name that is already known
    is not redefined.  @string, @comment and @preamble lines are dropped, and
    so is the next line that contains '}'.  On every other line an
    abbreviation that occurs outside braces and quotes is replaced by its
    value, and '#' concatenations on lines mentioning an abbreviation are
    merged with concat_line().

    Returns the rewritten contents as one newline-separated string.
    """
    # These are defined in bibtex, so we'll define them too
    abbr_list = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                 'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    value_list = ['January', 'February', 'March', 'April',
                  'May', 'June', 'July', 'August', 'September',
                  'October', 'November', 'December']

    def compile_abbr(name):
        # whole word, case-insensitive; group 1 keeps a comma that directly
        # follows the abbreviation
        return re.compile(r'\b' + name + r'(,?)\b', re.I)

    abbr_rex = [compile_abbr(name) for name in abbr_list]

    abbrdef_rex = re.compile(r'\s*@string\s*{\s*(' +
                             valid_name_chars + r'*)\s*=(.*)', re.I)

    comment_rex = re.compile(r'@comment\s*{', re.I)
    preamble_rex = re.compile(r'@preamble\s*{', re.I)

    waiting_for_end_string = False
    output = []

    for line in filecontents_source.splitlines():
        if line == ' ' or line == '':
            continue

        # NOTE(review): only the line containing '}' is dropped here; other
        # lines seen while waiting are processed normally -- confirm that
        # this is the intended nesting.
        if waiting_for_end_string and '}' in line:
            waiting_for_end_string = False
            continue

        definition = abbrdef_rex.search(line)
        if definition:
            # read the groups from the match itself, so text in front of
            # "@string" cannot leak into the abbreviation name
            abbr = definition.group(1)
            if abbr not in abbr_list:
                abbr_list.append(abbr)
                value_list.append(definition.group(2).strip())
                abbr_rex.append(compile_abbr(abbr))
            waiting_for_end_string = True
            continue

        if comment_rex.search(line) or preamble_rex.search(line):
            waiting_for_end_string = True
            continue

        # replace subsequent abbreviations with the value
        for abbr, rex, value in zip(abbr_list, abbr_rex, value_list):
            if rex.search(line):
                if verify_out_of_braces(line, abbr) == 1:
                    # A function is used as replacement so that backslashes
                    # in the value (LaTeX commands) are inserted literally
                    # instead of being read as regex template escapes.
                    line = rex.sub(
                        lambda found, value=value: value + found.group(1),
                        line)
                # Check for # concatenations
                if concatsplit_rex.search(line):
                    line = concat_line(line)

        output.append(line + '\n')

    filecontents2 = ''.join(output)

    # Do one final pass over file

    # make sure that didn't end up with {" or }" after the substitution
    filecontents2 = filecontents2.replace('{"', '{{')
    filecontents2 = filecontents2.replace('"}', '}}')

    afterquotevalue_rex = re.compile(r'"\s*,\s*')
    afterbrace_rex = re.compile(r'"\s*}')
    afterbracevalue_rex = re.compile(r'(=\s*{[^=]*)},\s*')

    # add new lines to data that changed because of abbreviation substitutions
    filecontents2 = afterquotevalue_rex.sub('",\n', filecontents2)
    filecontents2 = afterbrace_rex.sub('"\n}', filecontents2)
    filecontents2 = afterbracevalue_rex.sub('\\g<1>},\n', filecontents2)

    return filecontents2
433 | |
---|
def no_outer_parens(filecontents):
    """Convert @type( ... ) to @type{ ... }.

    filecontents -- the file contents as one string.

    The text is cut at every parenthesis and brace.  A '(' that directly
    follows a piece of text containing '@' becomes '{', and the ')' that
    brings the parenthesis count back to zero becomes '}'.  All other
    characters are returned unchanged.
    """
    open_paren_count = 0
    in_paren_entry = False   # an "@..." piece was seen and not yet closed
    expect_opener = False    # previous piece contained '@'

    # pieces are collected and joined once instead of growing a string
    rebuilt = []

    for phrase in re.split('([(){}])', filecontents):
        if expect_opener:
            if phrase == '(':
                phrase = '{'
                open_paren_count += 1
            else:
                in_paren_entry = False
            expect_opener = False

        if phrase == '(':
            open_paren_count += 1

        elif phrase == ')':
            open_paren_count -= 1
            if in_paren_entry and open_paren_count == 0:
                phrase = '}'
                in_paren_entry = False

        elif '@' in phrase:
            # same condition as searching for the pattern '@\w*'
            in_paren_entry = True
            expect_opener = True

        rebuilt.append(phrase)

    return ''.join(rebuilt)
477 | |
---|
478 | |
---|
def bibtexwasher(filecontents_source):
    """Format the bibtex file into a usable form.

    filecontents_source -- iterable of raw lines of the BibTeX file.

    All whitespace is made into just one space and lines starting with %
    are ignored.  The text is then re-split so that entry headers, field
    values and closing braces each start on a line of their own, and
    @string abbreviations are substituted.

    Returns a list of the non-blank lines, each ending in a newline.
    """
    space_rex = re.compile(r'\s+')
    comment_rex = re.compile(r'\s*%')

    # remove trailing and excessive whitespace, ignore comments
    washed = []
    for line in filecontents_source:
        line = space_rex.sub(' ', line.strip())
        if not comment_rex.match(line):
            washed.append(' ' + line)

    # the file is in one long string
    filecontents = ''.join(washed)

    filecontents = no_outer_parens(filecontents)

    #
    # split lines according to preferred syntax scheme
    #
    filecontents = re.sub(r'(=\s*{[^=]*)},', '\\g<1>},\n', filecontents)

    # add new lines after commas that are after values
    filecontents = re.sub(r'"\s*,', '",\n', filecontents)
    filecontents = re.sub(r'=\s*([\w\d]+)\s*,', '= \\g<1>,\n', filecontents)
    filecontents = re.sub(r'(@\w*)\s*({(\s*)[^,\s]*)\s*,',
                          '\n\n\\g<1>\\g<2>,\n', filecontents)

    # add new lines after }
    filecontents = re.sub(r'"\s*}', '"\n}\n', filecontents)
    filecontents = re.sub(r'}\s*,', '},\n', filecontents)

    filecontents = re.sub(r'@(\w*)', '\n@\\g<1>', filecontents)

    # character encoding, reserved latex characters
    filecontents = re.sub(r'{\\&}', '&', filecontents)
    filecontents = re.sub(r'\\&', '&', filecontents)

    # do checking for open braces to get format correct: a brace that
    # closes the outermost level is moved onto a new line
    open_brace_count = 0
    rebuilt = []
    for phrase in re.split('([{}])', filecontents):
        if phrase == '{':
            open_brace_count += 1
        elif phrase == '}':
            open_brace_count -= 1
            if open_brace_count == 0:
                rebuilt.append('\n')
        rebuilt.append(phrase)
    filecontents = ''.join(rebuilt)

    filecontents2 = bibtex_replace_abbreviations(filecontents)

    # gather the non-blank lines
    return [line + '\n'
            for line in filecontents2.splitlines()
            if line != '' and line != ' ']
563 | |
---|
564 | |
---|
def contentshandler(filecontents_source):
    """Convert BibTeX source lines and print the BibTeXML document.

    filecontents_source -- iterable of raw lines of the BibTeX file.

    The lines are washed and decoded, then written to standard output
    between the XML prolog and the closing </bibtex:file> tag.
    """
    washeddata = bibtexwasher(filecontents_source)
    outdata = bibtexdecoder(washeddata)

    # print() with a single parenthesized argument behaves the same as a
    # print statement on Python 2 and as a function call on Python 3
    print('<?xml version="1.0" encoding="utf-8"?>')
    #print('<?xml-stylesheet href="bibtexml.css" type="text/css" ?>')
    print('<!DOCTYPE bibtex:file PUBLIC')
    print(' "-//BibTeXML//DTD XML for BibTeX v1.0//EN"')
    print(' "bibtexml.dtd" >')
    print('<bibtex:file xmlns:bibtex="http://bibtexml.sf.net/">')
    print('')
    for line in outdata:
        print(line)
    print(' <!-- manual cleanup may be required... -->')
    print('</bibtex:file>')
579 | |
---|
580 | |
---|
def filehandler(filepath):
    """Return the lines of the file at *filepath*.

    filepath -- path of the BibTeX file to read.

    Returns the list produced by readlines().  If the file cannot be opened
    or read, a message is written to standard error and an empty list is
    returned, so the caller always receives a list.
    """
    import sys

    try:
        # the with block closes the file even if reading fails
        with open(filepath, 'r') as fd:
            return fd.readlines()
    except (IOError, OSError):
        # reported on stderr because stdout carries the XML output
        sys.stderr.write('Could not open file: %s\n' % filepath)
        return []
589 | |
---|
590 | |
---|
# main program

def main():
    """Convert the file named by the first argument, or standard input."""
    import sys

    arguments = sys.argv[1:]
    if arguments:
        source_lines = filehandler(arguments[0])
    else:
        # instead of exit() read stdin here
        source_lines = sys.stdin.readlines()
    contentshandler(source_lines)


if __name__ == "__main__":
    main()
604 | |
---|
605 | |
---|
606 | # end python script |
---|