Package qm :: Module structured_text
[hide private]
[frames | no frames]

Source Code for Module qm.structured_text

  1  ######################################################################## 
  2  # 
  3  # File:   structured_text.py 
  4  # Author: Alex Samuel 
  5  # Date:   2001-03-04 
  6  # 
  7  # Contents: 
  8  #   Code for processing structured text. 
  9  # 
 10  # Copyright (c) 2001, 2002 by CodeSourcery, LLC.  All rights reserved.  
 11  # 
 12  # For license terms see the file COPYING. 
 13  # 
 14  ######################################################################## 
 15   
 16  """Functions for parsing, processing, and formatting Python structured text. 
 17   
 18  See "Structured Text Formatting 
 19  Rules":http://www.python.org/sigs/doc-sig/stext.html for more 
 20  information. 
 21  """ 
 22   
 23  ######################################################################## 
 24  # imports 
 25  ######################################################################## 
 26   
 27  import cStringIO 
 28  import htmlentitydefs 
 29  import re 
 30  import string 
 31  import sys 
 32   
 33  ######################################################################## 
 34  # constants 
 35  ######################################################################## 
 36   
# HTML fragment giving a short end-user introduction to structured
# text: what it is, an example of bold markup, and a link to the full
# formatting rules.  The text is emitted as-is, so it must remain
# valid HTML.
html_help_text = '''
<h4>Structured Text</h4>

<p>Structured text is a simple set of conventions for formatting
ordinary text.  Usually, you can simply type ordinary text where QM
expects structured text; the resulting output will be line-wrapped, with
paragraph breaks indicated by blank lines.</p>

<p>Structured text also provides simple ways of adding elements such as
bulleted and numbered lists, bold and italics text, monospaced text, and
hyperlinks to ordinary text.  For example, to obtain bold text, place a
pair of asterisks on either side of it.  Thus,
<blockquote><pre>
**hello, there**
</pre></blockquote>
is displayed as
<blockquote>
<b>hello, there</b>
</blockquote>Structured text may be displayed as HTML, as plain text,
or in other formats.  The exact format of the output will depend on the
capabilities of the output system.</p>

<p>For a complete description of structured text rules, see the
<a href="http://www.python.org/sigs/doc-sig/stext.html">Structured Text
Formatting Rules</a>.</p>
'''
 63   
 64  ######################################################################## 
 65  # classes 
 66  ######################################################################## 
 67   
class Formatter:
    """Interface for output formatters for the 'StructuredTextProcessor'.

    A formatter receives a sequence of calls from the processor and
    renders them in a particular output format.  The processor calls
    'StartList', 'EndList', 'StartItem', 'EndItem',
    'FinishDefinedTerm', 'StartStyle', 'EndStyle', 'StartLink',
    'EndLink', 'WriteText', and 'End'.

    Valid list environment types are

      * definition list

      * ordered list

      * paragraph

      * unordered list

    In addition, the processor uses "verbatim" as an environment type
    when it emits a verbatim paragraph.

    Valid styles are

      * emphasized

      * strong

      * underlined

      * literal

      * verbatim

    """
96 97 98
class TextFormatter(Formatter):
    """Formatter for generating plain text from structured text.

    Text is wrapped at a configurable width.  Styles are rendered
    with the structured text markers themselves.  Hyperlinks are
    rendered as numbered references, and the link targets are listed
    when the document ends."""

    # The marker written both before and after text in each style.
    __style_markers = {
        "emphasized" : "*",
        "strong" : "**",
        "underlined" : "_",
        "literal" : "'",
        "verbatim" : "'''",
        }


    def __init__(self,
                 output_file=sys.stdout,
                 width=78,
                 indent_size=2,
                 indent=0,
                 list_bullet="-"):
        """Create a new plain text formatter.

        'output_file' -- A file object to which the formatted text is
        written.

        'width' -- The column beyond which a word is wrapped onto the
        next line.

        'indent_size' -- The number of columns by which a nested
        paragraph environment is indented.

        'indent' -- The initial indentation, in columns.

        'list_bullet' -- The text written in front of each item of an
        unordered list."""

        self.__output_file = output_file
        self.__width = width
        self.__indent_size = indent_size
        self.__indent = indent
        self.__list_bullet = list_bullet
        # The column at which the next character will be written.
        self.__col = 0
        # The number of list environments currently open.
        self.__list_depth = 0
        # The target of the hyperlink being written, if any.
        self.__current_link_target = None
        # Targets of all hyperlinks referenced so far, in order of
        # first reference.
        self.__link_targets = []


    def End(self):
        """End the processed text document."""

        # If there were any hyperlink references placed, we need to
        # list the link targets at the end of the document.
        if not self.__link_targets:
            return
        self.__NextLine()
        number = 0
        for target in self.__link_targets:
            # Print the reference number and link target, one to a
            # line.
            number = number + 1
            self.WriteText("[%d] %s" % (number, target))
            self.__NextLine()


    def WriteText(self, text):
        """Write ordinary text, wrapping it at the configured width."""

        # Split the text into words.  The grouping around the
        # separator makes 're.split' return the separators too; empty
        # strings are dropped.
        words = [word for word in re.split("( )", text) if word]
        at_line_start = 0
        for word in words:
            # Wrap if this word does not fit on the current line.
            if self.__col + len(word) > self.__width:
                self.__NextLine()
                self.__IndentTo(self.__indent)
                at_line_start = 1
            if at_line_start:
                # Don't print spaces at the start of a wrapped line.
                if word.strip() == "":
                    continue
                at_line_start = 0
            self.__Write(word)


    def StartList(self, type):
        """Start a list environment of type 'type'."""

        # Bump up indentation for paragraphs, except for the outermost
        # level.
        if type == "paragraph" and self.__list_depth > 0:
            self.__indent = self.__indent + self.__indent_size
        # Keep track of the nesting depth of lists.
        self.__list_depth = self.__list_depth + 1


    def EndList(self, type):
        """End a list environment of type 'type'."""

        # Keep track of the nesting depth of lists.
        self.__list_depth = self.__list_depth - 1
        # Bump back indentation when ending paragraph lists, except
        # for the outermost level.
        if type == "paragraph" and self.__list_depth > 0:
            self.__indent = self.__indent - self.__indent_size


    def StartItem(self, type, label=None):
        """Begin an element to the environment of type 'type'.

        'label' -- If type is "ordered list", this is the label for
        this list element."""

        self.__IndentTo(self.__indent)
        # For list items, emit the appropriate item tag.  Other
        # environment types have no item prefix.
        if type == "ordered list":
            self.__Write("%s " % label)
        elif type == "unordered list":
            self.__Write("%s " % self.__list_bullet)


    def FinishDefinedTerm(self):
        """Finish the definition of a term in a definition list."""

        self.__Write(" -- ")


    def EndItem(self, type):
        """End an element in the environment of type 'type'."""

        if type == "paragraph":
            # End a paragraph.  End this line if we've started writing
            # on it.
            if self.__col > self.__indent:
                self.__NextLine()
            # Skip another line.
            self.__NextLine()


    def StartStyle(self, style):
        """Start a new text style 'style'."""

        self.__Write(self.__style_markers[style])


    def EndStyle(self, style):
        """End the text style 'style'."""

        self.__Write(self.__style_markers[style])


    def StartLink(self, target):
        """Begin a hyperlink to 'target'.

        'target' -- The link target.

        Plain text cannot carry hyperlinks, so the target is only
        remembered here; 'EndLink' writes a numbered reference after
        the link text."""

        self.__current_link_target = target


    def EndLink(self):
        """End the hyperlink started by the last call to 'StartLink'."""

        target = self.__current_link_target
        self.__current_link_target = None
        if target is None:
            # No link is open; there is nothing to reference.
            return
        # Reuse the reference number if this target was seen before.
        if target not in self.__link_targets:
            self.__link_targets.append(target)
        number = self.__link_targets.index(target) + 1
        self.WriteText(" [%d]" % number)


    # Helper methods.

    def __IndentTo(self, col):
        """Pad the current line with spaces up to column 'col'."""

        if col > self.__col:
            self.__Write(" " * (col - self.__col))


    def __Write(self, text):
        """Write 'text' to the output file and advance the column."""

        self.__output_file.write(text)
        self.__col = self.__col + len(text)


    def __NextLine(self):
        """Terminate the current line and reset the column."""

        self.__Write("\n")
        self.__col = 0
293 294 295
class HtmlFormatter(Formatter):
    """Formatter for generating HTML from structured text.

    Each environment type and style maps to fixed opening and closing
    HTML tags.  Ordinary text is written with special characters
    converted to HTML entities."""

    # Tags written when a list environment of each type starts.
    __start_list_tags = {
        "definition list":  "<dl>\n",
        "ordered list":     "<ol>\n",
        "paragraph":        "",
        "unordered list":   "<ul>\n",
        "verbatim":         "",
        }

    # Tags written when a list environment of each type ends.
    __end_list_tags = {
        "definition list":  "</dl>\n",
        "ordered list":     "</ol>\n",
        "paragraph":        "",
        "unordered list":   "</ul>\n",
        "verbatim":         "",
        }

    # Tags written when an item in each environment type starts.
    __start_item_tags = {
        "definition list":  "<dt>",
        "ordered list":     "<li>\n",
        "paragraph":        "<p>",
        "unordered list":   "<li>\n",
        "verbatim":         "",
        }

    # Tags written when an item in each environment type ends.  A
    # definition list item ends the '<dd>' opened by
    # 'FinishDefinedTerm'.
    __end_item_tags = {
        "definition list":  "</dd>\n",
        "ordered list":     "</li>\n",
        "paragraph":        "</p>\n",
        "unordered list":   "</li>\n",
        "verbatim":         "",
        }

    # Tags written when each text style starts.
    __start_style_tags = {
        "emphasized":       "<em>",
        "strong":           "<strong>",
        "underlined":       "<u>",
        "literal":          "<tt>",
        "verbatim":         '<pre>\'<span class="verbatim">',
        }

    # Tags written when each text style ends.
    __end_style_tags = {
        "emphasized":       "</em>",
        "strong":           "</strong>",
        "underlined":       "</u>",
        "literal":          "</tt>",
        "verbatim":         '</span>\'</pre>',
        }


    def __init__(self, output_file=sys.stdout):
        """Create a new HTML formatter.

        'output_file' -- A file object to which HTML source is
        written."""

        self.__output_file = output_file


    def End(self):
        """End the processed text document."""

        # Nothing needs to be written at the end of the document.
        pass


    def WriteText(self, text):
        """Write ordinary text."""

        self.__Write(escape_html_entities(text))


    def StartList(self, type):
        """Start a list environment of type 'type'."""

        self.__Write(self.__start_list_tags[type])


    def EndList(self, type):
        """End a list environment of type 'type'."""

        self.__Write(self.__end_list_tags[type])


    def StartItem(self, type, label=None):
        """Begin an element to the environment of type 'type'.

        'label' -- If type is "ordered list", this is the label for
        this list element.  It is not used in the HTML output."""

        self.__Write(self.__start_item_tags[type])


    def FinishDefinedTerm(self):
        """Finish the definition of a term in a definition list."""

        self.__Write("</dt><dd>\n")


    def EndItem(self, type):
        """End an element in the environment of type 'type'."""

        self.__Write(self.__end_item_tags[type])


    def StartStyle(self, style):
        """Start a new text style 'style'."""

        self.__Write(self.__start_style_tags[style])


    def EndStyle(self, style):
        """End the text style 'style'."""

        self.__Write(self.__end_style_tags[style])


    def StartLink(self, target):
        """Begin a hyperlink to 'target'.

        'target' -- The link target.  It is taken from the document
        text, so it is escaped before being placed in the 'href'
        attribute."""

        self.__Write('<a href="%s">' % escape_html_entities(target))


    def EndLink(self):
        """End the hyperlink started by the last call to 'StartLink'."""

        self.__Write("</a>")


    # Helper methods.

    def __Write(self, text):
        """Write 'text' to the output file."""

        self.__output_file.write(text)
431 432 433
class StructuredTextProcessor:
    """Parser and formatter for Python structured text.

    A processor is called with structured text.  It splits the text
    into paragraphs, recognizes list items, inline styles, and
    hyperlinks, and drives a formatter object to produce output.
    Call 'End' after the last piece of text has been processed."""

    # Regex fragment matching a single punctuation or space character.
    __punctuation = "[%s]" % "][)(.,!?;:'\" "

    # Regex matching a list bullet at the start of the line.
    __bullet_regex = re.compile("^[-o*] +")

    # Regex matching a sequence label at the start of the line.
    __sequence_regex = re.compile(r"^([A-Za-z]+\.|[0-9]+\.?)+ +")

    # Regex matching a definition label at the start of the line.
    # Group 1 is the defined term.
    __definition_regex = re.compile("^(.*) +-- +")

    # Regex matching newlines plus any spaces on either side.
    __collapse_regex = re.compile(" *\n *", re.MULTILINE)

    # Regex matching indentation at the beginning of the text.
    # NOTE(review): without 're.MULTILINE' this matches only at the
    # very start of the string, so only the first line's indentation
    # is measured -- confirm whether that is intended.
    __indent_regex = re.compile("^ *")

    # Regex matching single-quoted literal text.  Group 1 is leading
    # spaces; group 2 is the literal text; group 3 is trailing spaces
    # and/or punctuation.
    __literal_regex = re.compile("( +|^)'([^']+)'(%s+|$)" % __punctuation)

    # Regex matching strong text.  Group 1 is leading spaces; group 2
    # is the strong text; group 3 is trailing spaces and/or
    # punctuation.
    __strong_regex = re.compile(r"( +|^)\*\*([^*]+)\*\*(%s+|$)"
                                % __punctuation)

    # Regex matching emphasized text.  Group 1 is leading spaces;
    # group 2 is the emphasized text; group 3 is trailing spaces
    # and/or punctuation.
    __emph_regex = re.compile(r"( +|^)\*([^*]+)\*(%s+|$)" % __punctuation)

    # Regex matching underlined text.  Group 1 is leading spaces;
    # group 2 is the underlined text; group 3 is trailing spaces
    # and/or punctuation.
    __underline_regex = re.compile("( +|^)_([^_]+)_(%s+|$)" % __punctuation)

    # Regex matching double-quoted text that may be a hyperlink.  If
    # there is a matching link footnote, the contents of the double
    # quotes, group 1, is a hyperlink.
    __link_regex = re.compile('"([^"]*)"')

    # Regex matching hyperlink footnotes.  Group 1 is the link text;
    # group 2 is the link target URL.
    __link_footnote_regex = re.compile('\n\\.\\. *"([^"]*)" *([^ \n]*)[^\n]*')

    # List types which may not include other environments nested
    # inside their items.
    __non_nestable_types = [
        "paragraph",
        ]


    def __init__(self, formatter):
        """Create a new structured text processor.

        'formatter' -- The formatter to use to generate output."""

        # Stack of open environments, innermost last.  Each element
        # is a pair '(type, indentation)'.
        self.__stack = []
        self.__formatter = formatter
        # Map from hyperlinked phrase to link target.
        self.__hyperlinks = {}


    def NormalizeSpaces(self, text):
        """Return 'text' with spaces normalized.

        Tabs become spaces, surrounding whitespace is removed, and a
        single trailing space is appended."""

        return text.replace("\t", " ").strip() + " "


    def __call__(self, text):
        """Process structured text 'text'.

        'text' -- A string of structured text.  Hyperlink footnotes
        found in it are recorded and removed; the remaining paragraphs
        are sent to the formatter."""

        text = self.__ExtractHyperlinks(text)

        for paragraph in get_paragraphs(text):
            # If this is a verbatim paragraph, handle it specially.
            match = _verbatim_regexp.match(paragraph)
            if match:
                # Verbatim text takes the indentation of the innermost
                # open environment.
                if self.__stack:
                    indentation = self.__stack[-1][1]
                else:
                    indentation = 0
                self.__SetType("verbatim", indentation)
                self.__formatter.StartStyle("verbatim")
                # Strip the triple quotes on either side.
                self.__formatter.WriteText(match.group(1)[3:-3])
                self.__formatter.EndStyle("verbatim")
                continue

            # Measure the paragraph's indentation and trim it from
            # the first line.
            indents = self.__indent_regex.findall(paragraph)
            indentation = min(map(len, indents))
            paragraph = paragraph[indentation:]

            # Skip empty paragraphs.
            if paragraph == "":
                continue

            # Does the first line start a list item?
            first_line = paragraph.split("\n", 1)[0]
            item = self.__MatchListItem(first_line)
            if item is not None:
                list_type, label, prefix_length = item
                # Put the formatter into the list environment.
                self.__SetType(list_type, indentation, label=label)
                # Cut off the bullet or label, and use the indentation
                # of the text itself.
                indentation = indentation + prefix_length
                paragraph = paragraph[prefix_length:]

            # Collapse the remaining paragraph into a single line of
            # text by replacing newlines with spaces.
            paragraph = self.__collapse_regex.sub(" ", paragraph)
            # Clean up spacing.
            paragraph = self.NormalizeSpaces(paragraph)
            # Now generate a paragraph for the rest of the text.
            self.__SetType("paragraph", indentation)
            self.__WriteText(paragraph)


    def End(self):
        """Stop processing text, and do any necessary cleanup."""

        # Pop out of any remaining environments.
        while self.__stack:
            top_type, top_indentation = self.__stack[-1]
            # End the item, then the environment.
            self.__formatter.EndItem(top_type)
            self.__PopType()
        # Finish up the formatter.
        self.__formatter.End()


    # Helper methods.

    def __ExtractHyperlinks(self, text):
        """Record and remove the hyperlink footnotes in 'text'.

        Each footnote adds an entry to the map of hyperlinked phrases.

        returns -- 'text' with all hyperlink footnotes removed."""

        position = 0
        while position < len(text):
            # Search from an absolute offset, so that the match
            # positions are absolute as well.
            match = self.__link_footnote_regex.search(text, position)
            if match is None:
                # No more; all done.
                break
            # Record the hyperlink.
            link_text = match.group(1).strip()
            self.__hyperlinks[link_text] = match.group(2)
            # Remove the footnote from the text.
            text = text[:match.start()] + text[match.end():]
            # Next, search the text that followed the footnote.
            position = match.start()
        return text


    def __MatchListItem(self, first_line):
        """Check whether 'first_line' starts a list item.

        returns -- 'None' if the line does not start a list item.
        Otherwise, a triplet '(type, label, length)', where 'type' is
        the list environment type, 'label' is the item's label (or
        'None' for an unordered list), and 'length' is the number of
        leading characters taken up by the bullet or label."""

        match = self.__bullet_regex.match(first_line)
        if match is not None:
            return ("unordered list", None, len(match.group(0)))
        match = self.__sequence_regex.match(first_line)
        if match is not None:
            return ("ordered list", match.group(1), len(match.group(0)))
        match = self.__definition_regex.match(first_line)
        if match is not None:
            return ("definition list", match.group(1), len(match.group(0)))
        return None


    def __PushType(self, type, indentation):
        """Start a new environment."""

        # The innermost environment may be of a type that cannot
        # contain nested environments in its items.  If that's the
        # case, end the item here.
        if self.__stack:
            top_type, top_indentation = self.__stack[-1]
            if top_type in self.__non_nestable_types:
                self.__formatter.EndItem(top_type)
        # Start the environment, and push it onto the stack.
        self.__formatter.StartList(type)
        self.__stack.append((type, indentation))


    def __PopType(self):
        """End and remove the innermost environment."""

        # End the topmost environment and remove it from the stack.
        top_type, top_indentation = self.__stack.pop()
        self.__formatter.EndList(top_type)
        # The new innermost environment may be of a type that cannot
        # contain nested environments.  If it is, then we
        # (prematurely) ended an item when we opened the environment
        # that just closed.  We'll have to open a new item here.
        if self.__stack:
            top_type, top_indentation = self.__stack[-1]
            if top_type in self.__non_nestable_types:
                self.__formatter.StartItem(top_type)


    def __SetType(self, type, indentation, label=None):
        """Set the environment type and indentation level.

        Environments are closed and opened as needed so that the
        innermost one has type 'type' at 'indentation'.  A new item is
        then started in it.

        'label' -- The label passed to the formatter for the new item.
        For a definition list it is also written as the defined
        term."""

        while 1:
            # Look at the current innermost environment (if there is
            # one).
            if self.__stack:
                top_type, top_indentation = self.__stack[-1]
            else:
                top_type, top_indentation = None, -1

            if indentation > top_indentation:
                # We're indented.  Nest a new environment in the
                # current item.
                self.__PushType(type, indentation)
                break

            # Outdented, or at the same indentation.  End the
            # previous item.
            self.__formatter.EndItem(top_type)
            if indentation < top_indentation:
                # We're outdented, so end the previous environment and
                # look at the one enclosing it.
                self.__PopType()
                continue
            if top_type != type:
                # Same indentation but different environment type.
                # End the previous environment, and start a new one.
                self.__PopType()
                self.__PushType(type, indentation)
            # Either way, the innermost environment is now the right
            # one and has no open item.  Stop here; another pass would
            # end an item that was never started.
            break

        # Start a new item in the current environment.
        self.__formatter.StartItem(type, label)
        if type == "definition list":
            self.__WriteText(label)
            self.__formatter.FinishDefinedTerm()


    def __WriteText(self, text):
        """Write paragraph text.

        Inline style markup and hyperlinks in 'text' are converted to
        the corresponding formatter calls."""

        # Look for various types of markup for special formatting for
        # a range of text.
        for regex, style in [
            (self.__literal_regex, "literal"),
            (self.__strong_regex, "strong"),
            (self.__emph_regex, "emphasized"),
            (self.__underline_regex, "underlined"),
            ]:
            match = regex.search(text)
            if match is None:
                continue
            # Recursively format everything up to the start of the
            # styled text, including the leading spaces.
            self.__WriteText(text[:match.end(1)])
            self.__formatter.StartStyle(style)
            # Literal text is pushed out directly.  Otherwise, format
            # it recursively.
            if style == "literal":
                self.__formatter.WriteText(match.group(2))
            else:
                self.__WriteText(match.group(2))
            self.__formatter.EndStyle(style)
            # Recursively format everything following the match.
            self.__WriteText(text[match.start(3):])
            return

        # Look for hyperlink markup.  Check every double-quoted
        # phrase, not only the first one; an ordinary quotation must
        # not hide a hyperlink that follows it.
        position = 0
        while 1:
            match = self.__link_regex.search(text, position)
            if match is None:
                break
            link_text = match.group(1).strip()
            # Is there a footnote providing a link target for this
            # phrase?
            if link_text in self.__hyperlinks:
                link_target = self.__hyperlinks[link_text]
                # Recursively format everything up to the start of
                # the match.
                self.__WriteText(text[:match.start(0)])
                # Emit the link around the recursively formatted
                # link text.
                self.__formatter.StartLink(link_target)
                self.__WriteText(match.group(1))
                self.__formatter.EndLink()
                # Recursively format everything following the match.
                self.__WriteText(text[match.end(0):])
                return
            # Not a hyperlink; try the next quoted phrase.
            position = match.end(0)

        # Nothing special.  Write ordinary text.
        self.__formatter.WriteText(text)
765 766 767 ######################################################################## 768 # functions 769 ######################################################################## 770
def escape_html_entities(text):
    """Return 'text' with special characters converted to HTML entities.

    'text' -- The string to escape.

    returns -- A copy of 'text' in which every character matched by
    the module's entity regex is replaced by its entity reference."""

    substitute = __entity_char_regex.sub
    return substitute(__entity_char_replacement, text)
775 776
def __format(text, formatter):
    """Process structured text 'text' with 'formatter'.

    'text' -- A string of structured text.

    'formatter' -- The formatter that receives the output.

    A new processor is created for 'formatter', run once over
    'text', and then ended."""

    run = StructuredTextProcessor(formatter)
    run(text)
    run.End()
783 784
def to_html(structured_text):
    """Return 'structured_text' formatted as HTML.

    'structured_text' -- A string of structured text.

    returns -- A string containing the generated HTML source."""

    # Collect the formatter's output in an in-memory buffer.
    buffer = cStringIO.StringIO()
    __format(structured_text, HtmlFormatter(buffer))
    return buffer.getvalue()
795 796
def to_text(structured_text, width=78, indent=0):
    """Return 'structured_text' formatted as plain text.

    'structured_text' -- A string of structured text.

    'width' -- The width of the text (including the indentation).

    'indent' -- The width of the block indentation of the formatted
    output.

    returns -- A string containing the formatted text."""

    # Collect the formatter's output in an in-memory buffer.
    buffer = cStringIO.StringIO()
    text_formatter = TextFormatter(buffer, width=width, indent=indent)
    __format(structured_text, text_formatter)
    return buffer.getvalue()
812 813
def get_first(structured_text):
    """Return the first line of 'structured_text'.

    By convention, the first line of a structured text description is
    a short summary.

    returns -- The text up to, but not including, the first newline,
    or all of 'structured_text' if it contains no newline."""

    newline = structured_text.find("\n")
    if newline < 0:
        return structured_text
    return structured_text[:newline]
821 822
def get_rest(structured_text):
    """Return the contents of 'structured_text' minus the first line.

    'structured_text' -- A string of structured text.

    returns -- Everything after the first newline, or the empty
    string if there is only one line."""

    parts = structured_text.split("\n", 1)
    # Splitting always yields at least one part, so a second part
    # exists only if there was a newline.  Handle single-line text
    # gracefully.
    if len(parts) > 1:
        return parts[1]
    else:
        return ""
832 833
def get_paragraphs(structured_text):
    """Split 'structured_text' into paragraphs.

    'structured_text' -- A string consisting of structured text.

    returns -- A sequence of paragraphs of structured text.  Each
    element in the sequence corresponds to a successive paragraph in
    the 'structured_text'.  Text enclosed in triple single quotes at
    the start of a paragraph forms a paragraph of its own.  If
    'structured_text' is the empty string, the sequence returned is
    empty."""

    paragraphs = []
    length = len(structured_text)
    # The first paragraph begins at the first character.
    begin = 0
    # We have not yet found the end of the paragraph.
    end = 0
    # Keep going until there is no more text.
    while end < length:
        # We are at the start of a paragraph; check to see if we are
        # looking at a piece of verbatim text.
        if length - end >= 6 and structured_text[end:end + 3] == "'''":
            closing = structured_text.find("'''", end + 3)
            if closing >= 0:
                # The verbatim text, quotes included, is a paragraph
                # of its own.
                end = closing + 3
                paragraphs.append(structured_text[begin:end])
                begin = end
                continue
            # The quotes are never closed.  Leave 'end' where it is
            # and treat the text as an ordinary paragraph.

        # Scan forward until we find the end of the paragraph or of
        # the text.
        while end < length:
            match = __paragraph_regexp.match(structured_text, end)
            if match:
                # Add the new paragraph to the list.  The next
                # paragraph begins right after the separator.
                paragraphs.append(structured_text[begin:end])
                begin = match.end()
                end = begin
                break
            # Advance to the next character.
            end = end + 1

    # We may have stopped in the middle of a paragraph.
    if begin != end:
        paragraphs.append(structured_text[begin:end])

    return paragraphs
887 888
def get_first_paragraph(structured_text):
    """Return the first paragraph of 'structured_text'.

    'structured_text' -- A string consisting of structured text.

    returns -- A string of structured text that is the first paragraph
    of the 'structured_text', or the empty string if the text
    contains no paragraphs."""

    paragraphs = get_paragraphs(structured_text)
    # Empty text yields no paragraphs at all; handle this gracefully.
    if not paragraphs:
        return ""
    return paragraphs[0]
########################################################################
# variables
########################################################################

# Map from each character that has an HTML entity to the corresponding
# entity reference.  We only handle single-byte characters.
__entity_char_map = {}
for entity, character in htmlentitydefs.entitydefs.items():
    if len(character) == 1:
        __entity_char_map[character] = "&%s;" % entity

# Regular expression for finding characters that need to be escaped as
# HTML entities.  Each character is escaped so that none of them can
# be taken for character class syntax.
__entity_char_regex = re.compile(
    "[" + string.join(map(re.escape, __entity_char_map.keys()), "") + "]")

# Function for use as the regex replacement that looks up the
# corresponding entity for a matched character.
__entity_char_replacement = lambda match, \
                            replacement_map=__entity_char_map: \
                            replacement_map[match.group(0)]

# Regex matching paragraph separators.
__paragraph_regexp = re.compile("(?:\n *)+\n")

# Regular expression matching verbatim paragraphs and trailing
# whitespace.
_verbatim_regexp = re.compile("('''.*''')(?:(?:\n *)+\n|\n?$)", re.DOTALL)

########################################################################
# script
########################################################################

# If invoked as a script, act as a structured text processor.

if __name__ == "__main__":
    # Parse command-line options.
    import getopt
    long_options = [
        "html",
        "text",
        ]
    options, arguments = getopt.getopt(sys.argv[1:], "", long_options)
    # Interpret them.  If several are given, the last one wins.
    formatter = None
    for option, option_argument in options:
        if option == "--html":
            formatter = HtmlFormatter()
        elif option == "--text":
            formatter = TextFormatter()
    # Use a text formatter by default.
    if formatter is None:
        formatter = TextFormatter()

    # Fire up a processor.
    processor = StructuredTextProcessor(formatter)

    # Were input files specified on the command line?
    if len(arguments) == 0:
        # No; read from standard input.
        processor(sys.stdin.read())
    else:
        # Yes; read and process each one in turn, making sure that
        # each file is closed again.
        for file_name in arguments:
            input_file = open(file_name, "rt")
            try:
                processor(input_file.read())
            finally:
                input_file.close()

    # End processing.
    processor.End()

    # All done.
    sys.exit(0)


########################################################################
# Local Variables:
# mode: python
# indent-tabs-mode: nil
# fill-column: 72
# End: