=== added file 'Code_python/TextAndFiles/Solutions/html_beautifulsoup_py2.py' --- Code_python/TextAndFiles/Solutions/html_beautifulsoup_py2.py 1970-01-01 00:00:00 +0000 +++ Code_python/TextAndFiles/Solutions/html_beautifulsoup_py2.py 2015-11-17 20:12:00 +0000 @@ -0,0 +1,83 @@ +#!/usr/bin/env python +""" +synopsis: + Demonstrate various capabilities of BeautifulSoup. +usage: + python beautifulsoup1.py ' +examples: + ./html_beautifulsoup_py3.py ../csv_report.html http://www.python.org +""" + +from __future__ import print_function +import sys +import urllib +from bs4 import BeautifulSoup + + +def show(doc): + #print(doc.prettify()) + print('-' * 50) + print('title: {}'.format(doc.title)) + print('-' * 50) + for x in doc.find_all('p'): + print(x.text) + print('-' * 50) + for element in doc.find_all('a'): + href = element.get('href') + text = element.text + print('-----') + print('element: {}'.format(element)) + print('href: {} text: "{}"'.format( + href.encode('utf-8'), + text.encode('utf-8'))) + #return doc + + +def walk_tree(node): + if hasattr(node, 'name') and node.name is not None: + print('name: {} attrs: {}'.format(node.name, node.attrs)) + if hasattr(node, 'children'): + for child in node.children: + walk_tree(child) + + +def main(): + args = sys.argv[1:] + if len(args) != 2: + sys.exit(__doc__) + # + # Read a document from a file. + infilename = args[0] + with open(infilename, 'r') as infile: + content = infile.read() + doc = BeautifulSoup(content, 'lxml') + show(doc) + # + # Download a document from the Internet. + print('=' * 50) + url = args[1] + infile = urllib.urlopen(url) + content = infile.read() + doc = BeautifulSoup(content, 'lxml') + show(doc) + # + # Walk the document tree. + print('=' * 50) + print('tree walk') + walk_tree(doc.html) + # + # Modify the document tree. Add a footer. 
+ tag = doc.new_tag('hr') + doc.body.append(tag) + tag = doc.new_tag('p') + tag.string = 'A simple footer' + doc.body.append(tag) + content = doc.prettify() + with open('tmp1.html', 'w') as outfile: + content = content.encode('u8') + outfile.write(content) + + +if __name__ == '__main__': + #import pdb; pdb.set_trace() + main() === added file 'Code_python/TextAndFiles/Solutions/html_beautifulsoup_py3.py' --- Code_python/TextAndFiles/Solutions/html_beautifulsoup_py3.py 1970-01-01 00:00:00 +0000 +++ Code_python/TextAndFiles/Solutions/html_beautifulsoup_py3.py 2015-11-17 20:12:00 +0000 @@ -0,0 +1,81 @@ +#!/usr/bin/env python +""" +synopsis: + Demonstrate various capabilities of BeautifulSoup. +usage: + python beautifulsoup1.py ' +examples: + ./html_beautifulsoup_py3.py ../csv_report.html http://www.python.org +""" + +from __future__ import print_function +import sys +import urllib.request +from bs4 import BeautifulSoup + + +def show(doc): + #print(doc.prettify()) + print('-' * 50) + print('title: {}'.format(doc.title)) + print('-' * 50) + for x in doc.find_all('p'): + print(x.string) + print('-' * 50) + for element in doc.find_all('a'): + href = element.get('href') + string = element.string + print('-----') + print('element: {}'.format(element)) + print('href: {} string: "{}"'.format(href, string)) + #return doc + + +def walk_tree(node): + print('type: {}'.format(type(node))) + if hasattr(node, 'name') and node.name is not None: + print('name: {} attrs: {}'.format(node.name, node.attrs)) + if hasattr(node, 'children'): + for child in node.children: + walk_tree(child) + + +def main(): + args = sys.argv[1:] + if len(args) != 2: + sys.exit(__doc__) + # + # Read a document from a file. + infilename = args[0] + with open(infilename, 'r') as infile: + content = infile.read() + doc = BeautifulSoup(content, 'lxml') + show(doc) + # + # Download a document from the Internet. 
+ print('=' * 50) + url = args[1] + with urllib.request.urlopen(url) as infile: + content = infile.read() + doc = BeautifulSoup(content, 'lxml') + show(doc) + # + # Walk the document tree. + print('=' * 50) + print('tree walk') + walk_tree(doc.html) + # + # Modify the document tree. Add a footer. + tag = doc.new_tag('hr') + doc.body.append(tag) + tag = doc.new_tag('p') + tag.string = 'A simple footer' + doc.body.append(tag) + content = doc.prettify() + with open('tmp1.html', 'w') as outfile: + outfile.write(content) + + +if __name__ == '__main__': + #import pdb; pdb.set_trace() + main() === modified file 'Code_python/TextAndFiles/Solutions/re_scan.py' --- Code_python/TextAndFiles/Solutions/re_scan.py 2013-11-12 00:27:30 +0000 +++ Code_python/TextAndFiles/Solutions/re_scan.py 2015-11-17 20:12:00 +0000 @@ -36,17 +36,13 @@ PAT1 = re.compile(r'^class') - # # Functions for external use, factories, etc: - # # Classes: - - class RegExprUtils(object): def __init__(self, file_pat): self.file_pat = file_pat @@ -78,7 +74,7 @@ line1 = re_pat.sub(repl, line) if line1 != line: print 'Repl: %s' % (line1, ) - + def extract(self, pat): """Scan files for match and extract group. Show items in group. """ @@ -94,7 +90,7 @@ infilename, lineno, line, ) for item in groups: print ' Match: "%s"' % (item, ) - + class Prompter(cmd.Cmd): def __init__(self, file_pat): @@ -142,7 +138,6 @@ pat = args[0] self.regexprutils.extract(pat) - def do_exit(self, args): """Exit from the command loop. 
""" @@ -155,15 +150,13 @@ pass - # # Functions for internal use: - - USAGE_TEXT = __doc__ + def usage(): print USAGE_TEXT sys.exit(1) @@ -172,12 +165,9 @@ def main(): args = sys.argv[1:] try: - opts, args = getopt.getopt(args, 'hva:', ['help', - ]) + opts, args = getopt.getopt(args, 'hva:', ['help', ]) except: usage() - verbose = False - avalue = None for opt, val in opts: if opt in ('-h', '--help'): usage() @@ -220,5 +210,3 @@ if __name__ == '__main__': #import pdb; pdb.set_trace() main() - - === added file 'Code_python/TextAndFiles/html_beautifulsoup.html' --- Code_python/TextAndFiles/html_beautifulsoup.html 1970-01-01 00:00:00 +0000 +++ Code_python/TextAndFiles/html_beautifulsoup.html 2015-11-17 20:12:00 +0000 @@ -0,0 +1,238 @@ + + + + + + +Parsing html with BeautifulSoup + + + +
+

Parsing html with BeautifulSoup

+ +

You can find documentation on BeautifulSoup here: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/

+

Hint -- Read both "Part 1" and "Part 2" before starting work. That +might enable you to avoid some re-factoring.

+
+

Part 1

+

Read an HTML file and parse it with BeautifulSoup. Then do each of +the following:

+
    +
  • Print the title of the document.
  • +
  • Print the text from each of the "<p>" elements from the document.
  • +
  • For each "<a>" element in the document, print (1) the value of its +"href" attribute and (2) the text in the element.
  • +
  • Walk the document tree. Write a recursive function to do the +walk. For each element (node) in the document, print out the name +(tag) and the attributes.
  • +
  • Add a footer to the document. For example, add a "<hr/>" element +and a "<p>some text</p>" element. Save the modified document to a +file.
  • +
+

You can use this HTML file (html_beautifulsoup.html) or any +other HTML file for input data.

+
+
+

Part 2

+

Use the "urllib" module from the Python standard library to download +a Web page, then do each of the above tasks.

+
+
+ + + === added file 'Code_python/TextAndFiles/html_beautifulsoup.txt' --- Code_python/TextAndFiles/html_beautifulsoup.txt 1970-01-01 00:00:00 +0000 +++ Code_python/TextAndFiles/html_beautifulsoup.txt 2015-11-17 20:12:00 +0000 @@ -0,0 +1,41 @@ +================================= +Parsing html with BeautifulSoup +================================= + +You can find documentation on BeautifulSoup here: +http://www.crummy.com/software/BeautifulSoup/bs4/doc/ + +Hint -- Read both "Part 1" and "Part 2" before starting work. That +might enable you to avoid some re-factoring. + + +Part 1 +======== + +Read an HTML file and parse it with BeautifulSoup. Then do each of +the following: + +- Print the title of the document. + +- Print the text from each of the "<p>" elements from the document. + +- For each "<a>" element in the document, print (1) the value of its + "href" attribute and (2) the text in the element. + +- Walk the document tree. Write a recursive function to do the + walk. For each element (node) in the document, print out the name + (tag) and the attributes. + +- Add a footer to the document. For example, add a "<hr/>" element + and a "<p>some text</p>" element. Save the modified document to a + file. + +You can use this HTML file (``html_beautifulsoup.html``) or any +other HTML file for input data. + + +Part 2 +======== + +Use the "urllib" module from the Python standard library to download +a Web page, then do each of the above tasks. === modified file 'Docs/agenda_4day.html' --- Docs/agenda_4day.html 2013-11-12 00:27:30 +0000 +++ Docs/agenda_4day.html 2015-11-17 20:12:00 +0000 @@ -3,7 +3,7 @@ - + Python Training Agenda

Python Training Agenda

-

This document provides an agenda for a four-day course in beginning -Python programming.

+

This document provides an outline and plan for a four-day course in +beginning Python programming.


Day 1 AM:

    @@ -243,19 +246,20 @@

    Day 3 AM:

      -
    • Statements continued: (1) del, (2) exec and eval().
    • +
    • Statements continued: (1) del, (2) exec and the eval() function.
    • Structured Python -- functions.
    • Practical exercises: statements and functions.

    Day 3 PM:

      +
    • More structured Python -- modules and packages.
    • +
    • Review of Python execution model.
    • +
    • Iterators and generators; list comprehensions, generator +expressions, dictionary and set comprehensions.
    • Structured Python, continued -- Object-oriented programming, classes, and instances.
    • -
    • More structured Python -- modules and packages.
    • -
    • Review of Python execution model.
    • -
    • Iterators and generators.
    • -
    • Practical exercises: (1) functions; (2) classes and object-oriented -programming; (3) iterators and generators.
    • +
    • Practical exercises: (1) functions; (2) iterators and generators; +(3) classes and object-oriented programming.

    Day 4 AM:

    @@ -267,19 +271,17 @@

Day 4 PM:

    -
  • Advanced topics -- (1) Embedded Python. (2) Extended Python; -extending Python in other languages (SWIG, Pyrex, etc). Comparison -with equivalent tasks for Jython.
  • Practical exercises: (1) functions; (2) classes; (3) modules.
  • Practical exercises -- Application related tasks and problems suggested by class members. Possible topics: (1) XML processing in Python; (2) database access with Python; (3) more text processing; -(4) etc.
  • +(4) the Python unit testing framework; (5) etc.
=== modified file 'Docs/agenda_4day.txt' --- Docs/agenda_4day.txt 2013-11-20 20:56:56 +0000 +++ Docs/agenda_4day.txt 2015-11-17 20:12:00 +0000 @@ -2,8 +2,8 @@ Python Training Agenda ======================== -This document provides an agenda for a four-day course in beginning -Python programming. +This document provides an outline and plan for a four-day course in +beginning Python programming. -------------------- @@ -67,7 +67,7 @@ Day 3 AM: -- Statements continued: (1) del, (2) exec and eval(). +- Statements continued: (1) del, (2) exec and the eval() function. - Structured Python -- functions. @@ -75,17 +75,18 @@ Day 3 PM: +- More structured Python -- modules and packages. + +- Review of Python execution model. + +- Iterators and generators; list comprehensions, generator + expressions, dictionary and set comprehensions. + - Structured Python, continued -- Object-oriented programming, classes, and instances. -- More structured Python -- modules and packages. - -- Review of Python execution model. - -- Iterators and generators. - -- Practical exercises: (1) functions; (2) classes and object-oriented - programming; (3) iterators and generators. +- Practical exercises: (1) functions; (2) iterators and generators; + (3) classes and object-oriented programming; -------------------- @@ -107,6 +108,4 @@ - Practical exercises -- Application related tasks and problems suggested by class members. Possible topics: (1) XML processing in Python; (2) database access with Python; (3) more text processing; - (4) etc. - - + (4) the Python unit testing framework; (5) etc. === modified file 'Docs/agenda_brief.txt' --- Docs/agenda_brief.txt 2014-01-17 22:08:55 +0000 +++ Docs/agenda_brief.txt 2015-11-17 20:12:00 +0000 @@ -16,8 +16,12 @@ 6. OOP (object-oriented programming) and classes -7. Modules and packages: import, templates - -8. Debugging and code checkers - -9. Additonal exercises +7. 
Miscellaneous topics: + + - Modules and packages: import, templates + + - Debugging and code checkers + + - Etc. + +8. Additional exercises === modified file 'README.txt' --- README.txt 2015-10-30 02:52:50 +0000 +++ README.txt 2015-11-17 20:12:00 +0000 @@ -9,7 +9,7 @@ :address: http://www.davekuhlman.org -:revision: 1.0b +:revision: 1.1a :date: |date| .. |date| date:: %B %d, %Y @@ -109,11 +109,13 @@ Materials |-- Code_python # Code and exercises | |-- Cmd + | | `-- Solutions | |-- CommandLineOptions | |-- ConfigParser | |-- Database | |-- Decorators | |-- DocServer + | | `-- Solutions | |-- ExceptionSubclass | |-- FixedLenRecords | |-- Import