~ubuntu-branches/ubuntu/karmic/calibre/karmic

Viewing changes to src/calibre/ebooks/oeb/base.py

Committer: Bazaar Package Importer
Author(s): Martin Pitt
Date: 2009-07-30 12:49:41 UTC
mfrom: (1.3.2 upstream)
Revision ID: james.westby@ubuntu.com-20090730124941-qjdsmri25zt8zocn

Tags: 0.6.3+dfsg-0ubuntu1

* New upstream release. Please see http://calibre.kovidgoyal.net/new_in_6/
  for the list of new features and changes.
* remove_postinstall.patch: Update for new version.
* build_debug.patch: Does not apply any more, disable for now. Might not be
  necessary any more.
* debian/copyright: Fix reference to versionless GPL.
* debian/rules: Drop obsolete dh_desktop call.
* debian/rules: Add workaround for weird Python 2.6 setuptools behaviour of
  putting compiled .so files into src/calibre/plugins/calibre/plugins
  instead of src/calibre/plugins.
* debian/rules: Drop hal fdi moving, new upstream version does not use hal
  any more. Drop hal dependency, too.
* debian/rules: Install udev rules into /lib/udev/rules.d.
* Add debian/calibre.preinst: Remove unmodified
  /etc/udev/rules.d/95-calibre.rules on upgrade.
* debian/control: Bump Python dependencies to 2.6, since upstream needs
  it now.

files added:
debian/calibre.preinst

jsmin.py

src/calibre/customize/conversion.py

src/calibre/customize/profiles.py

src/calibre/devices/android

src/calibre/devices/android/__init__.py

src/calibre/devices/android/driver.py

src/calibre/devices/irexdr

src/calibre/devices/irexdr/__init__.py

src/calibre/devices/irexdr/driver.py

src/calibre/devices/usbms/cli.py

src/calibre/devices/usbms/deviceconfig.py

src/calibre/ebooks/comic

src/calibre/ebooks/comic/__init__.py

src/calibre/ebooks/comic/input.py

src/calibre/ebooks/compression

src/calibre/ebooks/compression/__init__.py

src/calibre/ebooks/compression/palmdoc.c

src/calibre/ebooks/compression/palmdoc.py

src/calibre/ebooks/conversion

src/calibre/ebooks/conversion/__init__.py

src/calibre/ebooks/conversion/cli.py

src/calibre/ebooks/conversion/config.py

src/calibre/ebooks/conversion/plumber.py

src/calibre/ebooks/conversion/preprocess.py

src/calibre/ebooks/epub/input.py

src/calibre/ebooks/epub/output.py

src/calibre/ebooks/fb2

src/calibre/ebooks/fb2/__init__.py

src/calibre/ebooks/fb2/fb2.xsl

src/calibre/ebooks/fb2/fb2ml.py

src/calibre/ebooks/fb2/input.py

src/calibre/ebooks/fb2/output.py

src/calibre/ebooks/html

src/calibre/ebooks/html/__init__.py

src/calibre/ebooks/html/input.py

src/calibre/ebooks/lit/input.py

src/calibre/ebooks/lit/output.py

src/calibre/ebooks/lrf/output.py

src/calibre/ebooks/metadata/cli.py

src/calibre/ebooks/metadata/ereader.py

src/calibre/ebooks/metadata/pdb.py

src/calibre/ebooks/metadata/txt.py

src/calibre/ebooks/metadata/worker.py

src/calibre/ebooks/mobi/input.py

src/calibre/ebooks/mobi/output.py

src/calibre/ebooks/odt/input.py

src/calibre/ebooks/oeb/factory.py

src/calibre/ebooks/oeb/iterator.py

src/calibre/ebooks/oeb/output.py

src/calibre/ebooks/oeb/reader.py

src/calibre/ebooks/oeb/transforms/guide.py

src/calibre/ebooks/oeb/transforms/jacket.py

src/calibre/ebooks/oeb/transforms/linearize_tables.py

src/calibre/ebooks/oeb/transforms/metadata.py

src/calibre/ebooks/oeb/transforms/rescale.py

src/calibre/ebooks/oeb/transforms/split.py

src/calibre/ebooks/oeb/transforms/structure.py

src/calibre/ebooks/oeb/writer.py

src/calibre/ebooks/pdb

src/calibre/ebooks/pdb/__init__.py

src/calibre/ebooks/pdb/ereader

src/calibre/ebooks/pdb/ereader/__init__.py

src/calibre/ebooks/pdb/ereader/inspector.py

src/calibre/ebooks/pdb/ereader/output.py

src/calibre/ebooks/pdb/ereader/reader.py

src/calibre/ebooks/pdb/ereader/reader132.py

src/calibre/ebooks/pdb/ereader/reader202.py

src/calibre/ebooks/pdb/ereader/writer.py

src/calibre/ebooks/pdb/formatreader.py

src/calibre/ebooks/pdb/formatwriter.py

src/calibre/ebooks/pdb/header.py

src/calibre/ebooks/pdb/input.py

src/calibre/ebooks/pdb/output.py

src/calibre/ebooks/pdb/palmdoc

src/calibre/ebooks/pdb/palmdoc/__init__.py

src/calibre/ebooks/pdb/palmdoc/reader.py

src/calibre/ebooks/pdb/palmdoc/writer.py

src/calibre/ebooks/pdb/ztxt

src/calibre/ebooks/pdb/ztxt/__init__.py

src/calibre/ebooks/pdb/ztxt/reader.py

src/calibre/ebooks/pdb/ztxt/writer.py

src/calibre/ebooks/pdf/input.py

src/calibre/ebooks/pdf/manipulate

src/calibre/ebooks/pdf/manipulate/__init__.py

src/calibre/ebooks/pdf/manipulate/cli.py

src/calibre/ebooks/pdf/manipulate/crop.py

src/calibre/ebooks/pdf/manipulate/decrypt.py

src/calibre/ebooks/pdf/manipulate/encrypt.py

src/calibre/ebooks/pdf/manipulate/info.py

src/calibre/ebooks/pdf/manipulate/merge.py

src/calibre/ebooks/pdf/manipulate/reverse.py

src/calibre/ebooks/pdf/manipulate/rotate.py

src/calibre/ebooks/pdf/manipulate/split.py

src/calibre/ebooks/pdf/output.py

src/calibre/ebooks/pdf/pageoptions.py

src/calibre/ebooks/pdf/pdftohtml.py

src/calibre/ebooks/pdf/verify.py

src/calibre/ebooks/pdf/writer.py

src/calibre/ebooks/pml

src/calibre/ebooks/pml/__init__.py

src/calibre/ebooks/pml/input.py

src/calibre/ebooks/pml/output.py

src/calibre/ebooks/pml/pmlconverter.py

src/calibre/ebooks/pml/pmlml.py

src/calibre/ebooks/rb

src/calibre/ebooks/rb/__init__.py

src/calibre/ebooks/rb/input.py

src/calibre/ebooks/rb/output.py

src/calibre/ebooks/rb/rbml.py

src/calibre/ebooks/rb/reader.py

src/calibre/ebooks/rb/writer.py

src/calibre/ebooks/rtf

src/calibre/ebooks/rtf/__init__.py

src/calibre/ebooks/rtf/input.py

src/calibre/ebooks/rtf/output.py

src/calibre/ebooks/rtf/rtfml.py

src/calibre/ebooks/rtf/xsl.py

src/calibre/ebooks/txt

src/calibre/ebooks/txt/__init__.py

src/calibre/ebooks/txt/input.py

src/calibre/ebooks/txt/newlines.py

src/calibre/ebooks/txt/output.py

src/calibre/ebooks/txt/processor.py

src/calibre/ebooks/txt/txtml.py

src/calibre/ebooks/unidecode

src/calibre/ebooks/unidecode/__init__.py

src/calibre/ebooks/unidecode/unicodepoints.py

src/calibre/ebooks/unidecode/unidecoder.py

src/calibre/gui2/convert

src/calibre/gui2/convert/__init__.py

src/calibre/gui2/convert/bulk.py

src/calibre/gui2/convert/comic_input.py

src/calibre/gui2/convert/comic_input.ui

src/calibre/gui2/convert/epub_output.py

src/calibre/gui2/convert/epub_output.ui

src/calibre/gui2/convert/fb2_input.py

src/calibre/gui2/convert/fb2_input.ui

src/calibre/gui2/convert/gui_conversion.py

src/calibre/gui2/convert/look_and_feel.py

src/calibre/gui2/convert/look_and_feel.ui

src/calibre/gui2/convert/lrf_output.py

src/calibre/gui2/convert/lrf_output.ui

src/calibre/gui2/convert/metadata.py

src/calibre/gui2/convert/metadata.ui

src/calibre/gui2/convert/mobi_output.py

src/calibre/gui2/convert/mobi_output.ui

src/calibre/gui2/convert/page_setup.py

src/calibre/gui2/convert/page_setup.ui

src/calibre/gui2/convert/pdb_output.py

src/calibre/gui2/convert/pdb_output.ui

src/calibre/gui2/convert/pdf_input.py

src/calibre/gui2/convert/pdf_input.ui

src/calibre/gui2/convert/pdf_output.py

src/calibre/gui2/convert/pdf_output.ui

src/calibre/gui2/convert/single.py

src/calibre/gui2/convert/single.ui

src/calibre/gui2/convert/structure_detection.py

src/calibre/gui2/convert/structure_detection.ui

src/calibre/gui2/convert/toc.py

src/calibre/gui2/convert/toc.ui

src/calibre/gui2/convert/txt_output.py

src/calibre/gui2/convert/txt_output.ui

src/calibre/gui2/convert/xpath_edit.ui

src/calibre/gui2/convert/xpath_wizard.py

src/calibre/gui2/convert/xpath_wizard.ui

src/calibre/gui2/device_drivers

src/calibre/gui2/device_drivers/__init__.py

src/calibre/gui2/device_drivers/configwidget.py

src/calibre/gui2/device_drivers/configwidget.ui

src/calibre/gui2/images/eject.svg

src/calibre/gui2/images/news/elperiodico_catalan.png

src/calibre/gui2/images/news/elperiodico_spanish.png

src/calibre/gui2/images/news/eltiempo_hn.png

src/calibre/gui2/images/news/expansion_spanish.png

src/calibre/gui2/images/news/fastcompany.png

src/calibre/gui2/images/news/gva_be.png

src/calibre/gui2/images/news/hln.png

src/calibre/gui2/images/news/inquirer_net.png

src/calibre/gui2/images/news/laprensa_hn.png

src/calibre/gui2/images/news/latribuna.png

src/calibre/gui2/images/news/noaa.png

src/calibre/gui2/images/news/theeconomictimes_india.png

src/calibre/gui2/images/news/tijd.png

src/calibre/gui2/images/news/uncrate.png

src/calibre/gui2/images/print-preview.svg

src/calibre/gui2/images/print.svg

src/calibre/gui2/images/welcome_wizard.svg

src/calibre/gui2/images/wizard.svg

src/calibre/gui2/jobs.py

src/calibre/gui2/tag_view.py

src/calibre/gui2/viewer/bookmarkmanager.py

src/calibre/gui2/viewer/bookmarkmanager.ui

src/calibre/gui2/viewer/hyphenate

src/calibre/gui2/viewer/hyphenate/Hyphenator.js

src/calibre/gui2/viewer/hyphenate/patterns

src/calibre/gui2/viewer/hyphenate/patterns/bn.js

src/calibre/gui2/viewer/hyphenate/patterns/cs.js

src/calibre/gui2/viewer/hyphenate/patterns/da.js

src/calibre/gui2/viewer/hyphenate/patterns/de.js

src/calibre/gui2/viewer/hyphenate/patterns/en.js

src/calibre/gui2/viewer/hyphenate/patterns/es.js

src/calibre/gui2/viewer/hyphenate/patterns/fi.js

src/calibre/gui2/viewer/hyphenate/patterns/fr.js

src/calibre/gui2/viewer/hyphenate/patterns/gu.js

src/calibre/gui2/viewer/hyphenate/patterns/hi.js

src/calibre/gui2/viewer/hyphenate/patterns/hu.js

src/calibre/gui2/viewer/hyphenate/patterns/it.js

src/calibre/gui2/viewer/hyphenate/patterns/kn.js

src/calibre/gui2/viewer/hyphenate/patterns/ml.js

src/calibre/gui2/viewer/hyphenate/patterns/nl.js

src/calibre/gui2/viewer/hyphenate/patterns/or.js

src/calibre/gui2/viewer/hyphenate/patterns/pa.js

src/calibre/gui2/viewer/hyphenate/patterns/pl.js

src/calibre/gui2/viewer/hyphenate/patterns/pt.js

src/calibre/gui2/viewer/hyphenate/patterns/ru.js

src/calibre/gui2/viewer/hyphenate/patterns/sv.js

src/calibre/gui2/viewer/hyphenate/patterns/ta.js

src/calibre/gui2/viewer/hyphenate/patterns/te.js

src/calibre/gui2/viewer/hyphenate/patterns/uk.js

src/calibre/gui2/viewer/printing.py

src/calibre/gui2/wizard

src/calibre/gui2/wizard/__init__.py

src/calibre/gui2/wizard/device.ui

src/calibre/gui2/wizard/finish.ui

src/calibre/gui2/wizard/kindle.ui

src/calibre/gui2/wizard/library.ui

src/calibre/gui2/wizard/send_email.py

src/calibre/gui2/wizard/send_email.ui

src/calibre/gui2/wizard/stanza.ui

src/calibre/library/move.py

src/calibre/library/static/calibre_banner.png

src/calibre/utils/complete.py

src/calibre/utils/fonts

src/calibre/utils/fonts/__init__.py

src/calibre/utils/fonts/fontconfig.c

src/calibre/utils/ipc

src/calibre/utils/ipc/__init__.py

src/calibre/utils/ipc/job.py

src/calibre/utils/ipc/launch.py

src/calibre/utils/ipc/server.py

src/calibre/utils/ipc/worker.py

src/calibre/utils/logging.py

src/calibre/web/feeds/input.py

src/calibre/web/feeds/recipes/recipe_7dias.py

src/calibre/web/feeds/recipes/recipe_accountancyage.py

src/calibre/web/feeds/recipes/recipe_buenosaireseconomico.py

src/calibre/web/feeds/recipes/recipe_craigslist.py

src/calibre/web/feeds/recipes/recipe_degentenaar.py

src/calibre/web/feeds/recipes/recipe_diagonales.py

src/calibre/web/feeds/recipes/recipe_elperiodico_catalan.py

src/calibre/web/feeds/recipes/recipe_elperiodico_spanish.py

src/calibre/web/feeds/recipes/recipe_eltiempo_hn.py

src/calibre/web/feeds/recipes/recipe_expansion_spanish.py

src/calibre/web/feeds/recipes/recipe_fastcompany.py

src/calibre/web/feeds/recipes/recipe_gva_be.py

src/calibre/web/feeds/recipes/recipe_hln.py

src/calibre/web/feeds/recipes/recipe_inquirer_net.py

src/calibre/web/feeds/recipes/recipe_kellog_faculty.py

src/calibre/web/feeds/recipes/recipe_kellog_insight.py

src/calibre/web/feeds/recipes/recipe_laprensa_hn.py

src/calibre/web/feeds/recipes/recipe_latribuna.py

src/calibre/web/feeds/recipes/recipe_lavanguardia.py

src/calibre/web/feeds/recipes/recipe_marca.py

src/calibre/web/feeds/recipes/recipe_miradasalsur.py

src/calibre/web/feeds/recipes/recipe_newsweek_argentina.py

src/calibre/web/feeds/recipes/recipe_noaa.py

src/calibre/web/feeds/recipes/recipe_publico.py

src/calibre/web/feeds/recipes/recipe_slate.py

src/calibre/web/feeds/recipes/recipe_the_budget_fashionista.py

src/calibre/web/feeds/recipes/recipe_theeconomictimes_india.py

src/calibre/web/feeds/recipes/recipe_tijd.py

src/calibre/web/feeds/recipes/recipe_uncrate.py

src/calibre/web/feeds/recipes/recipe_veintitres.py

files removed:
src/calibre/ebooks/epub/fonts.py

src/calibre/ebooks/epub/from_any.py

src/calibre/ebooks/epub/from_comic.py

src/calibre/ebooks/epub/from_feeds.py

src/calibre/ebooks/epub/from_html.py

src/calibre/ebooks/epub/iterator.py

src/calibre/ebooks/epub/split.py

src/calibre/ebooks/html.py

src/calibre/ebooks/lrf/any

src/calibre/ebooks/lrf/any/__init__.py

src/calibre/ebooks/lrf/any/convert_from.py

src/calibre/ebooks/lrf/comic

src/calibre/ebooks/lrf/comic/__init__.py

src/calibre/ebooks/lrf/comic/convert_from.py

src/calibre/ebooks/lrf/epub

src/calibre/ebooks/lrf/epub/__init__.py

src/calibre/ebooks/lrf/epub/convert_from.py

src/calibre/ebooks/lrf/fb2

src/calibre/ebooks/lrf/fb2/__init__.py

src/calibre/ebooks/lrf/fb2/convert_from.py

src/calibre/ebooks/lrf/fb2/fb2.xsl

src/calibre/ebooks/lrf/feeds

src/calibre/ebooks/lrf/feeds/__init__.py

src/calibre/ebooks/lrf/feeds/convert_from.py

src/calibre/ebooks/lrf/lit

src/calibre/ebooks/lrf/lit/__init__.py

src/calibre/ebooks/lrf/lit/convert_from.py

src/calibre/ebooks/lrf/mobi

src/calibre/ebooks/lrf/mobi/__init__.py

src/calibre/ebooks/lrf/mobi/convert_from.py

src/calibre/ebooks/lrf/pdf

src/calibre/ebooks/lrf/pdf/__init__.py

src/calibre/ebooks/lrf/pdf/convert_from.py

src/calibre/ebooks/lrf/pdf/reflow.py

src/calibre/ebooks/lrf/rtf

src/calibre/ebooks/lrf/rtf/__init__.py

src/calibre/ebooks/lrf/rtf/convert_from.py

src/calibre/ebooks/lrf/rtf/xsl.py

src/calibre/ebooks/lrf/txt

src/calibre/ebooks/lrf/txt/__init__.py

src/calibre/ebooks/lrf/txt/convert_from.py

src/calibre/ebooks/lrf/txt/demo

src/calibre/ebooks/lrf/txt/demo/demo.txt

src/calibre/ebooks/lrf/txt/demo/small.jpg

src/calibre/ebooks/lrf/web

src/calibre/ebooks/lrf/web/__init__.py

src/calibre/ebooks/lrf/web/convert_from.py

src/calibre/ebooks/lrf/web/profiles

src/calibre/ebooks/lrf/web/profiles/__init__.py

src/calibre/ebooks/lrf/web/profiles/ap.py

src/calibre/ebooks/lrf/web/profiles/atlantic.py

src/calibre/ebooks/lrf/web/profiles/automatic.py

src/calibre/ebooks/lrf/web/profiles/barrons.py

src/calibre/ebooks/lrf/web/profiles/bbc.py

src/calibre/ebooks/lrf/web/profiles/chr_mon.py

src/calibre/ebooks/lrf/web/profiles/cnn.py

src/calibre/ebooks/lrf/web/profiles/economist.py

src/calibre/ebooks/lrf/web/profiles/faznet.py

src/calibre/ebooks/lrf/web/profiles/jpost.py

src/calibre/ebooks/lrf/web/profiles/jutarnji.py

src/calibre/ebooks/lrf/web/profiles/nasa.py

src/calibre/ebooks/lrf/web/profiles/newsweek.py

src/calibre/ebooks/lrf/web/profiles/newyorker.py

src/calibre/ebooks/lrf/web/profiles/newyorkreview.py

src/calibre/ebooks/lrf/web/profiles/nytimes.py

src/calibre/ebooks/lrf/web/profiles/portfolio.py

src/calibre/ebooks/lrf/web/profiles/reuters.py

src/calibre/ebooks/lrf/web/profiles/spiegelde.py

src/calibre/ebooks/lrf/web/profiles/upi.py

src/calibre/ebooks/lrf/web/profiles/usatoday.py

src/calibre/ebooks/lrf/web/profiles/wash_post.py

src/calibre/ebooks/lrf/web/profiles/wsj.py

src/calibre/ebooks/lrf/web/profiles/zeitde.py

src/calibre/ebooks/mobi/from_any.py

src/calibre/ebooks/mobi/from_comic.py

src/calibre/ebooks/mobi/from_feeds.py

src/calibre/ebooks/mobi/palmdoc.py

src/calibre/ebooks/odt/to_oeb.py

src/calibre/ebooks/pdf/pdftrim.py

src/calibre/gui2/dialogs/epub.py

src/calibre/gui2/dialogs/epub.ui

src/calibre/gui2/dialogs/jobs.py

src/calibre/gui2/dialogs/lrf_single.py

src/calibre/gui2/dialogs/lrf_single.ui

src/calibre/gui2/dialogs/mobi.py

src/calibre/gui2/dialogs/warning.ui

src/calibre/gui2/jobs2.py

src/calibre/gui2/tags.py

src/calibre/parallel.py

src/calibre/trac/donations

src/calibre/trac/donations/server.py

src/calibre/utils/fontconfig.py

src/calibre/utils/single_qt_application.py

src/calibre/web/feeds/main.py

files modified:
.pydevproject

debian/changelog

debian/control

debian/copyright

debian/patches/remove_postinstall.patch

debian/patches/series

debian/rules

installer/linux/freeze.py

installer/osx/freeze.py

installer/windows/build_installer.py

installer/windows/calibre/calibre.mpi

installer/windows/freeze.py

setup.py

src/calibre/__init__.py

src/calibre/constants.py

src/calibre/customize/__init__.py

src/calibre/customize/builtins.py

src/calibre/customize/ui.py

src/calibre/debug.py

src/calibre/devices/__init__.py

src/calibre/devices/bebook/driver.py

src/calibre/devices/blackberry/driver.py

src/calibre/devices/cybookg3/driver.py

src/calibre/devices/eb600/driver.py *

src/calibre/devices/interface.py

src/calibre/devices/jetbook/driver.py

src/calibre/devices/kindle/driver.py *

src/calibre/devices/libusb.py

src/calibre/devices/prs500/books.py

src/calibre/devices/prs500/cli/main.py

src/calibre/devices/prs500/driver.py *

src/calibre/devices/prs500/prstypes.py

src/calibre/devices/prs505/books.py

src/calibre/devices/prs505/driver.py

src/calibre/devices/prs700/driver.py

src/calibre/devices/usbms/books.py

src/calibre/devices/usbms/device.py

src/calibre/devices/usbms/driver.py

src/calibre/ebooks/__init__.py

src/calibre/ebooks/chardet/__init__.py

src/calibre/ebooks/epub/__init__.py

src/calibre/ebooks/epub/pages.py

src/calibre/ebooks/lit/reader.py

src/calibre/ebooks/lit/writer.py

src/calibre/ebooks/lrf/__init__.py

src/calibre/ebooks/lrf/html/convert_from.py

src/calibre/ebooks/lrf/lrs/convert_from.py

src/calibre/ebooks/lrf/meta.py

src/calibre/ebooks/lrf/objects.py

src/calibre/ebooks/lrf/tags.py

src/calibre/ebooks/metadata/__init__.py

src/calibre/ebooks/metadata/epub.py

src/calibre/ebooks/metadata/fb2.py

src/calibre/ebooks/metadata/fetch.py

src/calibre/ebooks/metadata/google_books.py

src/calibre/ebooks/metadata/html.py

src/calibre/ebooks/metadata/imp.py

src/calibre/ebooks/metadata/isbndb.py

src/calibre/ebooks/metadata/lit.py

src/calibre/ebooks/metadata/lrx.py

src/calibre/ebooks/metadata/meta.py

src/calibre/ebooks/metadata/mobi.py

src/calibre/ebooks/metadata/ncx.xml

src/calibre/ebooks/metadata/odt.py

src/calibre/ebooks/metadata/opf.py

src/calibre/ebooks/metadata/opf.xml

src/calibre/ebooks/metadata/opf2.py

src/calibre/ebooks/metadata/pdf.py

src/calibre/ebooks/metadata/rb.py

src/calibre/ebooks/metadata/rtf.py

src/calibre/ebooks/metadata/toc.py

src/calibre/ebooks/mobi/langcodes.py

src/calibre/ebooks/mobi/mobiml.py

src/calibre/ebooks/mobi/reader.py

src/calibre/ebooks/mobi/writer.py

src/calibre/ebooks/oeb/base.py

src/calibre/ebooks/oeb/stylizer.py

src/calibre/ebooks/oeb/transforms/__init__.py

src/calibre/ebooks/oeb/transforms/flatcss.py

src/calibre/ebooks/oeb/transforms/htmltoc.py

src/calibre/ebooks/oeb/transforms/manglecase.py

src/calibre/ebooks/oeb/transforms/rasterize.py

src/calibre/ebooks/oeb/transforms/trimmanifest.py

src/calibre/ebooks/rtf2xml/ParseRtf.py

src/calibre/ebooks/rtf2xml/pict.py

src/calibre/gui2/__init__.py

src/calibre/gui2/add.py

src/calibre/gui2/device.py

src/calibre/gui2/dialogs/comicconf.ui

src/calibre/gui2/dialogs/config.py

src/calibre/gui2/dialogs/config.ui

src/calibre/gui2/dialogs/fetch_metadata.py

src/calibre/gui2/dialogs/jobs.ui

src/calibre/gui2/dialogs/metadata_bulk.py

src/calibre/gui2/dialogs/metadata_bulk.ui

src/calibre/gui2/dialogs/metadata_single.py

src/calibre/gui2/dialogs/metadata_single.ui

src/calibre/gui2/dialogs/progress.py

src/calibre/gui2/dialogs/scheduler.py

src/calibre/gui2/dialogs/scheduler.ui

src/calibre/gui2/dialogs/tag_editor.py

src/calibre/gui2/dialogs/tag_editor.ui

src/calibre/gui2/dialogs/user_profiles.py

src/calibre/gui2/dialogs/user_profiles.ui

src/calibre/gui2/filename_pattern.ui

src/calibre/gui2/images/back.svg

src/calibre/gui2/images/forward.svg

src/calibre/gui2/library.py

src/calibre/gui2/main.py

src/calibre/gui2/main.ui

src/calibre/gui2/main_window.py

src/calibre/gui2/status.py

src/calibre/gui2/tools.py

src/calibre/gui2/viewer/config.ui

src/calibre/gui2/viewer/documentview.py

src/calibre/gui2/viewer/js.py

src/calibre/gui2/viewer/main.py

src/calibre/gui2/viewer/main.ui

src/calibre/gui2/widgets.py

src/calibre/library/__init__.py

src/calibre/library/database.py

src/calibre/library/database2.py

src/calibre/library/server.py

src/calibre/library/sqlite.py

src/calibre/library/static/calibre.png

src/calibre/libunrar.py

src/calibre/libunzip.py

src/calibre/linux.py

src/calibre/manual/custom.py

src/calibre/manual/faq.rst

src/calibre/manual/news.rst

src/calibre/manual/news_recipe.rst

src/calibre/ptempfile.py

src/calibre/trac/plugins/Changelog.py

src/calibre/trac/plugins/download.py

src/calibre/trac/plugins/templates/linux.html

src/calibre/translations/ar.po

src/calibre/translations/bg.po

src/calibre/translations/ca.po

src/calibre/translations/calibre.pot

src/calibre/translations/cs.po

src/calibre/translations/da.po

src/calibre/translations/de.po

src/calibre/translations/el.po

src/calibre/translations/es.po

src/calibre/translations/fr.po

src/calibre/translations/gl.po

src/calibre/translations/he.po

src/calibre/translations/hr.po

src/calibre/translations/hu.po

src/calibre/translations/it.po

src/calibre/translations/ja.po

src/calibre/translations/nb.po

src/calibre/translations/nds.po

src/calibre/translations/nl.po

src/calibre/translations/pl.po

src/calibre/translations/pt.po

src/calibre/translations/ro.po

src/calibre/translations/ru.po

src/calibre/translations/sk.po

src/calibre/translations/sl.po

src/calibre/translations/sv.po

src/calibre/translations/te.po

src/calibre/translations/uk.po

src/calibre/utils/config.py

src/calibre/utils/filenames.py

src/calibre/utils/lock.py

src/calibre/utils/podofo/__init__.py

src/calibre/utils/podofo/podofo.cpp

src/calibre/utils/search_query_parser.py

src/calibre/utils/terminfo.py

src/calibre/utils/windows/winutil.c

src/calibre/web/__init__.py

src/calibre/web/feeds/__init__.py

src/calibre/web/feeds/news.py

src/calibre/web/feeds/recipes/__init__.py

src/calibre/web/feeds/recipes/recipe_al_jazeera.py

src/calibre/web/feeds/recipes/recipe_azstarnet.py

src/calibre/web/feeds/recipes/recipe_barrons.py

src/calibre/web/feeds/recipes/recipe_bbc.py

src/calibre/web/feeds/recipes/recipe_clarin.py

src/calibre/web/feeds/recipes/recipe_climate_progress.py

src/calibre/web/feeds/recipes/recipe_coding_horror.py

src/calibre/web/feeds/recipes/recipe_dna.py

src/calibre/web/feeds/recipes/recipe_economist.py

src/calibre/web/feeds/recipes/recipe_elektrolese.py

src/calibre/web/feeds/recipes/recipe_espn.py

src/calibre/web/feeds/recipes/recipe_estadao.py

src/calibre/web/feeds/recipes/recipe_globe_and_mail.py

src/calibre/web/feeds/recipes/recipe_guardian.py

src/calibre/web/feeds/recipes/recipe_harpers.py

src/calibre/web/feeds/recipes/recipe_harpers_full.py

src/calibre/web/feeds/recipes/recipe_jb_online.py

src/calibre/web/feeds/recipes/recipe_linuxdevices.py

src/calibre/web/feeds/recipes/recipe_moneynews.py

src/calibre/web/feeds/recipes/recipe_new_yorker.py

src/calibre/web/feeds/recipes/recipe_newsweek.py

src/calibre/web/feeds/recipes/recipe_nytimes.py

src/calibre/web/feeds/recipes/recipe_nytimes_sub.py

src/calibre/web/feeds/recipes/recipe_o_globo.py

src/calibre/web/feeds/recipes/recipe_san_fran_chronicle.py

src/calibre/web/feeds/recipes/recipe_scott_hanselman.py

src/calibre/web/feeds/recipes/recipe_stackoverflow.py

src/calibre/web/feeds/recipes/recipe_time_magazine.py

src/calibre/web/feeds/recipes/recipe_usatoday.py

src/calibre/web/feeds/recipes/recipe_wash_post.py

src/calibre/web/feeds/recipes/recipe_winsupersite.py

src/calibre/web/feeds/recipes/recipe_wired.py

src/calibre/web/feeds/recipes/recipe_wsj.py

src/calibre/web/feeds/recipes/recipe_zaobao.py

src/calibre/web/feeds/templates.py

src/calibre/web/fetch/simple.py

src/calibre/www/settings.py

src/calibre/www/static/img/faces/john.png

src/pyPdf/pdf.py

todo

upload.py

Show diffs side-by-side

added added

removed removed

src/calibre/ebooks/oeb/base.py

__license__ = 'GPL v3'

__copyright__ = '2008, Marshall T. Vandegrift <llasram@gmail.com>'

__docformat__ = 'restructuredtext en'

import os, sys, re, uuid, copy

from mimetypes import types_map, guess_type

import os, re, uuid, logging

from mimetypes import types_map

from collections import defaultdict

from types import StringTypes

from itertools import izip, count, chain

from itertools import count

from urlparse import urldefrag, urlparse, urlunparse

from urllib import unquote as urlunquote

from urlparse import urljoin

from lxml import etree, html

import calibre

from calibre import LoggingInterface

from cssutils import CSSParser

from calibre.translations.dynamic import translate

from calibre.startup import get_lang

from calibre.ebooks.chardet import xml_to_unicode

from calibre.ebooks.oeb.entitydefs import ENTITYDEFS

from calibre.ebooks.metadata.epub import CoverRenderer

from calibre.ptempfile import TemporaryDirectory

from calibre.ebooks.conversion.preprocess import CSSPreProcessor

# Namespace URIs used when building qualified names and XPath prefix maps.
XML_NS = 'http://www.w3.org/XML/1998/namespace'

XHTML_NS = 'http://www.w3.org/1999/xhtml'

SVG_NS = 'http://www.w3.org/2000/svg'

XLINK_NS = 'http://www.w3.org/1999/xlink'

CALIBRE_NS = 'http://calibre.kovidgoyal.net/2009/metadata'

# Prefix -> namespace URI map handed to XPath evaluation (``namespaces=``).
# NOTE(review): XPNSMAP is assigned twice in this view; the later assignment
# further down replaces this one.
XPNSMAP = {

'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS,

'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,

'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS,

'svg': SVG_NS, 'xl' : XLINK_NS

}

# XPNSMAP prefixes of the three Dublin Core namespaces, newest listed first.
DC_PREFIXES = ('d11', 'd10', 'd09')

RE_NS = 'http://exslt.org/regular-expressions'

MBP_NS = 'http://www.mobipocket.com'

# Final value of XPNSMAP: the map above plus the 're', 'mbp' and 'calibre'
# prefixes.
XPNSMAP = {'h' : XHTML_NS, 'o1' : OPF1_NS, 'o2' : OPF2_NS,

'd09': DC09_NS, 'd10': DC10_NS, 'd11': DC11_NS,

'xsi': XSI_NS, 'dt' : DCTERMS_NS, 'ncx': NCX_NS,

'svg': SVG_NS, 'xl' : XLINK_NS, 're': RE_NS,

'mbp': MBP_NS, 'calibre': CALIBRE_NS }

# Prefix maps for OPF 1 and OPF 2 documents.
# NOTE(review): presumably used as nsmap when creating OPF elements; confirm
# against the callers, which are not visible here.
OPF1_NSMAP = {'dc': DC11_NS, 'oebpackage': OPF1_NS}

OPF2_NSMAP = {'opf': OPF2_NS, 'dc': DC11_NS, 'dcterms': DCTERMS_NS,

'xsi': XSI_NS, 'calibre': CALIBRE_NS}

def XML(name):
    """Return ``name`` qualified with the XML namespace.

    The result has the form ``{namespace-uri}name``, built from ``XML_NS``.
    """
    qualified = '{%s}%s' % (XML_NS, name)
    return qualified

def CALIBRE(name):
    """Return ``name`` qualified with the calibre metadata namespace.

    The result has the form ``{namespace-uri}name``, built from ``CALIBRE_NS``.
    """
    qualified = '{%s}%s' % (CALIBRE_NS, name)
    return qualified

def LINK_SELECTORS():
    """Compile the XPath expressions that select link-bearing attributes.

    Returns a list of ``etree.XPath`` objects, one per expression, in the
    order the expressions are listed. All of them use the XPNSMAP prefixes.
    """
    expressions = (
        'h:head/h:link/@href', 'h:body//h:a/@href',
        'h:body//h:img/@src', 'h:body//h:object/@data',
        'h:body//*/@xl:href', '//ncx:content/@src',
        'o2:page/@href',
    )
    return [etree.XPath(expr, namespaces=XPNSMAP) for expr in expressions]

# The builder runs once at import time; from here on the name refers to the
# list of compiled selectors rather than to the function.
LINK_SELECTORS = LINK_SELECTORS()

_css_url_re = re.compile(r'url$(.*?)$', re.I)

_css_import_re = re.compile(r'@import "(.*?)"')

_archive_re = re.compile(r'[^ ]+')

def iterlinks(root):
    '''
    Iterate over all links in an OEB document.

    Yields ``(element, attribute, link, pos)`` tuples. ``attribute`` is
    ``None`` when the link was found in the text of a ``<style>`` element;
    ``pos`` is the offset of the link inside the attribute value or text
    (0 when the link is the whole attribute value).

    :param root: A valid lxml.etree element.
    '''
    assert etree.iselement(root)
    linkish = set(html.defs.link_attrs)
    linkish.add(XLINK('href'))

    for node in root.iter():
        attrs = node.attrib
        try:
            name = node.tag
        except UnicodeDecodeError:
            # Tag name cannot be decoded; skip this node entirely.
            continue

        if name != XHTML('object'):
            # Ordinary element: report every attribute known to hold a link.
            for key in attrs:
                if key in linkish:
                    yield (node, key, attrs[key], 0)
        else:
            # <object> attributes are relative to its ``codebase``, if any.
            base = None
            if 'codebase' in attrs:
                base = node.get('codebase')
                yield (node, 'codebase', base, 0)
            for key in ('classid', 'data'):
                if key not in attrs:
                    continue
                target = node.get(key)
                if base is not None:
                    target = urljoin(base, target)
                yield (node, key, target, 0)
            if 'archive' in attrs:
                # ``archive`` holds a space-separated list of URLs.
                for hit in _archive_re.finditer(node.get('archive')):
                    target = hit.group(0)
                    if base is not None:
                        target = urljoin(base, target)
                    yield (node, 'archive', target, hit.start())

        if name == XHTML('style') and node.text:
            # Links embedded in a stylesheet: url(...) first, then @import.
            for pattern in (_css_url_re, _css_import_re):
                for hit in pattern.finditer(node.text):
                    yield (node, None, hit.group(1), hit.start(1))
        if 'style' in attrs:
            # Links embedded in an inline style attribute.
            for hit in _css_url_re.finditer(attrs['style']):
                yield (node, 'style', hit.group(1), hit.start(1))

140

141

def make_links_absolute(root, base_url):
    '''
    Rewrite every link in the document as an absolute URL.

    Each link is joined onto ``base_url`` (the full URL the document came
    from) with ``urljoin`` and written back through ``rewrite_links``.
    '''
    rewrite_links(root, lambda href: urljoin(base_url, href))

150

151

def resolve_base_href(root):
    '''
    Apply the document's ``<base href="...">`` to its links.

    Every ``base`` element carrying an ``href`` (with or without the XHTML
    namespace) is removed from the tree. If a non-empty href was found, all
    links in the document are then made absolute against it; otherwise the
    links are left untouched.

    :param root: Root element of the document; it is modified in place.
    '''
    base_href = None
    basetags = root.xpath('//base[@href]|//h:base[@href]',
            namespaces=XPNSMAP)
    for b in basetags:
        # With several base tags the href of the last one wins.
        base_href = b.get('href')
        # NOTE(review): drop_tree() is an lxml.html element method; confirm
        # the documents reaching this function are built from such elements.
        b.drop_tree()
    if not base_href:
        return
    # make_links_absolute() accepts only (root, base_url). Passing an extra
    # ``resolve_base_href`` keyword raised TypeError, and the base tags have
    # already been dropped above, so there is nothing left to resolve.
    make_links_absolute(root, base_href)

161

162

def rewrite_links(root, link_repl_func, resolve_base_href=False):
    '''
    Rewrite all the links in the document. For each link
    ``link_repl_func(link)`` will be called (with surrounding whitespace
    stripped from the link), and the return value will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``). They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely.

    :param root: Root element of the document; it is modified in place.
    :param link_repl_func: Callable mapping an old link to its replacement.
    :param resolve_base_href: When true, first apply and remove any
        ``<base href>`` via the module-level ``resolve_base_href()``.
    '''
    if resolve_base_href:
        # The flag parameter shadows the module-level function of the same
        # name, so calling ``resolve_base_href(root)`` would call a bool.
        # Fetch the function from the module globals instead.
        globals()['resolve_base_href'](root)
    for el, attrib, link, pos in iterlinks(root):
        new_link = link_repl_func(link.strip())
        if new_link == link:
            continue
        if new_link is None:
            # Remove the attribute or element content
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            continue
        if attrib is None:
            # Link lives in the element text (a <style> body): splice it in.
            # NOTE(review): ``pos`` was computed on the text as it was when
            # iterlinks() started scanning it; confirm behaviour when several
            # links in one text are replaced by strings of different length.
            new = el.text[:pos] + new_link + el.text[pos+len(link):]
            el.text = new
        else:
            cur = el.attrib[attrib]
            if not pos and len(cur) == len(link):
                # Most common case: the link is the whole attribute value.
                el.attrib[attrib] = new_link
            else:
                # Link is embedded in a longer value (e.g. archive, style).
                new = cur[:pos] + new_link + cur[pos+len(link):]
                el.attrib[attrib] = new

200

201

202

# MIME types taken from the ``mimetypes`` table, plus a generic binary type.
EPUB_MIME = types_map['.epub']
XHTML_MIME = types_map['.xhtml']
SVG_MIME = types_map['.svg']
BINARY_MIME = 'application/octet-stream'

# CSS ``@namespace`` rule declaring XHTML as the default namespace.
XHTML_CSS_NAMESPACE = u'@namespace "%s";\n' % XHTML_NS

# Groups of MIME types, used for membership tests on manifest media types.
OEB_STYLES = set([CSS_MIME, OEB_CSS_MIME, 'text/x-oeb-css'])
OEB_DOCS = set([
    XHTML_MIME,
    'text/html',
    OEB_DOC_MIME,
    'text/x-oeb-document',
])
OEB_RASTER_IMAGES = set([GIF_MIME, JPEG_MIME, PNG_MIME])
OEB_IMAGES = OEB_RASTER_IMAGES | set([SVG_MIME])

111

224

167

280

def xpath(elem, expr):
    """Evaluate ``expr`` on ``elem`` with the XPNSMAP namespace prefixes.

    Returns whatever ``elem.xpath`` returns for the expression.
    """
    matches = elem.xpath(expr, namespaces=XPNSMAP)
    return matches

169

282

170

def xml2str(root, pretty_print=False, strip_comments=False):
    """Serialize ``root`` as UTF-8 encoded XML with an XML declaration.

    Calling it with only ``root`` behaves like the earlier one-argument form.

    :param root: Element or tree accepted by ``etree.tostring``.
    :param pretty_print: Forwarded to ``etree.tostring``.
    :param strip_comments: When true, remove ``<!-- ... -->`` comments from
        the serialized output.
    :return: The serialized document.
    """
    ans = etree.tostring(root, encoding='utf-8', xml_declaration=True,
                         pretty_print=pretty_print)

    if strip_comments:
        # An empty pattern would substitute nothing and leave every comment
        # in place. DOTALL lets one comment span several lines; the
        # non-greedy match stops at the first closing marker.
        ans = re.compile(r'<!--.*?-->', re.DOTALL).sub('', ans)

    return ans

291

292

293

def xml2unicode(root, pretty_print=False):
    """Serialize ``root`` with ``etree.tostring`` and return the result.

    NOTE(review): no ``encoding`` argument is passed, so the result type is
    whatever ``etree.tostring`` produces by default; confirm that callers
    really receive a unicode string as the name suggests.
    """
    serialized = etree.tostring(root, pretty_print=pretty_print)
    return serialized

172

295

173

296

# All single-byte characters: the 7-bit range and the full 8-bit range.
ASCII_CHARS = set(map(chr, xrange(128)))
UNIBYTE_CHARS = set(map(chr, xrange(256)))

# Characters that must be quoted in a URL. urlquote() picks index 0 for
# unicode input and index 1 for any other input.
URL_UNSAFE = [ASCII_CHARS - URL_SAFE, UNIBYTE_CHARS - URL_SAFE]

179

302

180

303

def urlquote(href):

304

"""Quote URL-unsafe characters, allowing IRI-safe characters."""

181

305

result = []

182

306

unsafe = 0 if isinstance(href, unicode) else 1

183

307

unsafe = URL_UNSAFE[unsafe]

188

312

return ''.join(result)

189

313

190

314

def urlnormalize(href):

315

"""Convert a URL into normalized form, with all and only URL-unsafe

316

characters URL quoted.

317

"""

191

318

parts = urlparse(href)

192

if not parts.scheme:

319

if not parts.scheme or parts.scheme == 'file':

193

320

path, frag = urldefrag(href)

194

321

parts = ('', '', path, '', '', frag)

195

322

parts = (part.replace('\\', '/') for part in parts)

197

324

parts = (urlquote(part) for part in parts)

198

325

return urlunparse(parts)

199

326

327

def merge_multiple_html_heads_and_bodies(root, log=None):
    """Collapse several <head>/<body> elements into a single pair.

    If the document contains at most one XHTML ``head`` and at most one
    XHTML ``body``, `root` is returned untouched. Otherwise every child of
    `root` is removed, a fresh ``head`` and ``body`` are created, the
    children of all heads found are moved into the new ``head``, the
    children of all bodies found into the new ``body``, and both are
    appended to `root`. Only child nodes are moved; attributes and leading
    text of the original head/body elements are not carried over.

    :param root: root element of the document; modified in place.
    :param log: optional logger; its ``warn`` method is called when a merge
        actually happens.
    :return: `root`.
    """
    heads, bodies = xpath(root, '//h:head'), xpath(root, '//h:body')
    if len(heads) <= 1 and len(bodies) <= 1:
        return root

    # Iterate over snapshots: removing or re-parenting a node while walking
    # its parent's live child sequence relies on iterator internals.
    for child in list(root):
        root.remove(child)

    head = root.makeelement(XHTML('head'))
    body = root.makeelement(XHTML('body'))
    for h in heads:
        for x in list(h):
            head.append(x)
    for b in bodies:
        for x in list(b):
            body.append(x)

    # Explicit appends instead of map() used only for its side effects.
    root.append(head)
    root.append(body)

    if log is not None:
        log.warn('Merging multiple <head> and <body> sections')
    return root

343

344

345

346

347

348

class DummyHandler(logging.Handler):
    """Logging handler that relays formatted records to ``self.log``.

    The handler is created with level ``logging.WARNING`` and a formatter
    that emits only the message text. ``log`` starts out as ``None``; while
    it is ``None`` records are silently dropped.
    """

    def __init__(self):
        logging.Handler.__init__(self, logging.WARNING)
        message_only = logging.Formatter('%(message)s')
        self.setFormatter(message_only)
        self.log = None

    def emit(self, record):
        """Forward `record` to ``self.log.error`` or ``self.log.warn``."""
        if self.log is None:
            return
        text = self.format(record)
        # ERROR and above go to error(); everything else is a warning.
        if record.levelno >= logging.ERROR:
            sink = self.log.error
        else:
            sink = self.log.warn
        sink(text)

361

362

363

_css_logger = logging.getLogger('calibre.css')

364

_css_logger.setLevel(logging.WARNING)

365

_css_log_handler = DummyHandler()

366

_css_logger.addHandler(_css_log_handler)

200

367

201

368

class OEBError(Exception):

369

"""Generic OEB-processing error."""

202

370

pass

203

371

204

372

class NotHTML(OEBError):

373

'''Raised when a file that should be HTML (as per manifest) is not'''

205

374

pass

206

375

207

208

class FauxLogger(object):

209

def __getattr__(self, name):

210

return self

211

def __call__(self, message):

212

print message

213

214

class Logger(LoggingInterface, object):

215

def __getattr__(self, name):

216

return object.__getattribute__(self, 'log_' + name)

217

218

219

class AbstractContainer(object):

220

def read_xml(self, path):

221

return etree.fromstring(

222

self.read(path), base_url=os.path.dirname(path))

223

224

class DirContainer(AbstractContainer):

225

def __init__(self, rootdir):

226

self.rootdir = unicode(rootdir)

227

228

def read(self, path):

376

class NullContainer(object):
    """An empty container.

    For use with book formats which do not support container-like access.
    Reading and writing always raise :class:`OEBError`; no path exists and
    the name list is empty.
    """

    def __init__(self, log):
        self.log = log

    def read(self, path):
        """Always raises :class:`OEBError`."""
        raise OEBError('Attempt to read from NullContainer')

    def write(self, path, data=None):
        """Always raises :class:`OEBError`.

        `data` is accepted (and ignored) so that callers using the
        ``write(path, data)`` calling convention get the intended
        :class:`OEBError` rather than a ``TypeError``.
        """
        raise OEBError('Attempt to write to NullContainer')

    def exists(self, path):
        """Nothing exists in a null container."""
        return False

    def namelist(self):
        """Return an empty list of names."""
        return []

396

397

class DirContainer(object):

398

"""Filesystem directory container."""

399

400

def __init__(self, path, log):

401

self.log = log

402

path = unicode(path)

403

ext = os.path.splitext(path)[1].lower()

404

if ext == '.opf':

405

self.opfname = os.path.basename(path)

406

self.rootdir = os.path.dirname(path)

407

return

408

self.rootdir = path

409

for path in self.namelist():

410

ext = os.path.splitext(path)[1].lower()

411

if ext == '.opf':

412

self.opfname = path

413

return

414

self.opfname = None

415

416

def read(self, path):

417

if path is None:

418

path = self.opfname

229

419

path = os.path.join(self.rootdir, path)

230

420

with open(urlunquote(path), 'rb') as f:

231

421

return f.read()

242

432

path = os.path.join(self.rootdir, path)

243

433

return os.path.isfile(urlunquote(path))

244

434

245

class DirWriter(object):

246

def __init__(self, version='2.0', page_map=False):

247

self.version = version

248

self.page_map = page_map

249

250

def dump(self, oeb, path):

251

version = int(self.version[0])

252

opfname = None

253

if os.path.splitext(path)[1].lower() == '.opf':

254

opfname = os.path.basename(path)

255

path = os.path.dirname(path)

256

if not os.path.isdir(path):

257

os.mkdir(path)

258

output = DirContainer(path)

259

for item in oeb.manifest.values():

260

output.write(item.href, str(item))

261

if version == 1:

262

metadata = oeb.to_opf1()

263

elif version == 2:

264

metadata = oeb.to_opf2(page_map=self.page_map)

265

else:

266

raise OEBError("Unrecognized OPF version %r" % self.version)

267

for mime, (href, data) in metadata.items():

268

if opfname and mime == OPF_MIME:

269

href = opfname

270

output.write(href, xml2str(data))

271

return

435

def namelist(self):
    """Return the paths of all files found below ``self.rootdir``.

    Each entry is the walked directory joined with the file name, with
    backslashes replaced by forward slashes.
    """
    found = []
    for dirpath, subdirs, filenames in os.walk(self.rootdir):
        found.extend(os.path.join(dirpath, name).replace('\\', '/')
                     for name in filenames)
    return found

272

443

273

444

274

445

class Metadata(object):

275

DC_TERMS = set([

276

'contributor', 'coverage', 'creator', 'date',

277

'description', 'format', 'identifier', 'language',

278

'publisher', 'relation', 'rights', 'source', 'subject',

279

'title', 'type'

280

])

281

CALIBRE_TERMS = set(['series', 'series_index', 'rating'])

446

"""A collection of OEB data model metadata.

447

448

Provides access to the list of items associated with a particular metadata

449

term via the term's local name using either Python container or attribute

450

syntax. Return an empty list for any terms with no currently associated

451

metadata items.

452

"""

453

454

DC_TERMS = set(['contributor', 'coverage', 'creator', 'date',

455

'description', 'format', 'identifier', 'language',

456

'publisher', 'relation', 'rights', 'source',

457

'subject', 'title', 'type'])

458

CALIBRE_TERMS = set(['series', 'series_index', 'rating', 'timestamp',

459

'publication_type'])

282

460

OPF_ATTRS = {'role': OPF('role'), 'file-as': OPF('file-as'),

283

461

'scheme': OPF('scheme'), 'event': OPF('event'),

284

462

'type': XSI('type'), 'lang': XML('lang'), 'id': 'id'}

287

465

'xsi': XSI_NS, 'calibre': CALIBRE_NS}

288

466

289

467

class Item(object):

290

468

"""An item of OEB data model metadata.

469

470

The metadata term or name may be accessed via the :attr:`term` or

471

:attr:`name` attributes. The metadata value or content may be accessed

472

via the :attr:`value` or :attr:`content` attributes, or via Unicode or

473

string representations of the object.

474

475

OEB data model metadata attributes may be accessed either via their

476

fully-qualified names using the Python container access syntax, or via

477

their local names using Python attribute syntax. Only attributes

478

allowed by the OPF 2.0 specification are supported.

479

"""

291

480

class Attribute(object):

481

"""Smart accessor for allowed OEB metadata item attributes."""

292

482

293

483

def __init__(self, attr, allowed=None):

294

484

if not callable(attr):

340

530

if attr != nsattr:

341

531

attrib[nsattr] = attrib.pop(attr)

342

532

343

scheme = Attribute(lambda term : 'scheme' if term == OPF('meta') else OPF('scheme'),

344

[DC('identifier'), OPF('meta')])

345

file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor')])

533

@dynamic_property

534

def name(self):

535

def fget(self):

536

return self.term

537

return property(fget=fget)

538

539

@dynamic_property

540

def content(self):

541

def fget(self):

542

return self.value

543

def fset(self, value):

544

self.value = value

545

return property(fget=fget, fset=fset)

546

547

scheme = Attribute(lambda term: 'scheme' if \

548

term == OPF('meta') else OPF('scheme'),

549

[DC('identifier'), OPF('meta')])

550

file_as = Attribute(OPF('file-as'), [DC('creator'), DC('contributor'),

551

DC('title')])

346

552

role = Attribute(OPF('role'), [DC('creator'), DC('contributor')])

347

553

event = Attribute(OPF('event'), [DC('date')])

348

554

id = Attribute('id')

349

type = Attribute(XSI('type'), [DC('date'), DC('format'), DC('type')])

555

type = Attribute(XSI('type'), [DC('date'), DC('format'),

556

DC('type')])

350

557

lang = Attribute(XML('lang'), [DC('contributor'), DC('coverage'),

351

DC('creator'), DC('publisher'),

352

DC('relation'), DC('rights'),

353

DC('source'), DC('subject'),

354

OPF('meta')])

558

DC('creator'), DC('publisher'),

559

DC('relation'), DC('rights'),

560

DC('source'), DC('subject'),

561

OPF('meta')])

355

562

356

563

def __getitem__(self, key):

357

564

return self.attrib[key]

409

616

self.items = defaultdict(list)

410

617

411

618

def add(self, term, value, attrib={}, nsmap={}, **kwargs):

619

"""Add a new metadata item."""

412

620

item = self.Item(term, value, attrib, nsmap, **kwargs)

413

621

items = self.items[barename(item.term)]

414

622

items.append(item)

419

627

yield key

420

628

__iter__ = iterkeys

421

629

630

def clear(self, key):
    """Remove every item stored under `key`.

    The list held in ``self.items[key]`` is emptied in place, so existing
    references to it see the change.
    """
    del self.items[key][:]

634

635

def filter(self, key, predicate):
    """Remove the items stored under `key` for which `predicate` is true.

    :param key: term whose item list (``self.items[key]``) is filtered.
    :param predicate: callable taking one item; items for which it returns
        a true value are dropped.

    The list is rebuilt in place, so existing references to it see the
    change. Items are dropped by position rather than via ``list.remove``,
    which would delete the first element comparing *equal* to the matched
    item (possibly a different one) and rescan the list for every removal.
    """
    l = self.items[key]
    l[:] = [x for x in l if not predicate(x)]

640

641

642

422

643

def __getitem__(self, key):

423

644

return self.items[key]

424

645

428

649

def __getattr__(self, term):

429

650

return self.items[term]

430

651

431

@apply

432

def _nsmap():

652

@dynamic_property

653

def _nsmap(self):

433

654

def fget(self):

434

655

nsmap = {}

435

656

for term in self.items:

438

659

return nsmap

439

660

return property(fget=fget)

440

661

441

@apply

442

def _opf1_nsmap():

662

@dynamic_property

663

def _opf1_nsmap(self):

443

664

def fget(self):

444

665

nsmap = self._nsmap

445

666

for key, value in nsmap.items():

448

669

return nsmap

449

670

return property(fget=fget)

450

671

451

452

@apply

453

def _opf2_nsmap():

672

@dynamic_property

673

def _opf2_nsmap(self):

454

674

def fget(self):

455

675

nsmap = self._nsmap

456

nsmap.update(self.OPF2_NSMAP)

676

nsmap.update(OPF2_NSMAP)

457

677

return nsmap

458

678

return property(fget=fget)

459

679

460

461

680

def to_opf1(self, parent=None):

462

681

nsmap = self._opf1_nsmap

463

682

nsrmap = dict((value, key) for key, value in nsmap.items())

464

683

elem = element(parent, 'metadata', nsmap=nsmap)

465

dcmeta = element(elem, 'dc-metadata', nsmap=self.OPF1_NSMAP)

684

dcmeta = element(elem, 'dc-metadata', nsmap=OPF1_NSMAP)

466

685

xmeta = element(elem, 'x-metadata')

467

686

for term in self.items:

468

687

for item in self.items[term]:

483

702

484

703

485

704

class Manifest(object):

705

"""Collection of files composing an OEB data model book.

706

707

Provides access to the content of the files composing the book and

708

attributes associated with those files, including their internal paths,

709

unique identifiers, and MIME types.

710

711

Itself acts as a :class:`set` of manifest items, and provides the following

712

instance data member for dictionary-like access:

713

714

:attr:`ids`: A dictionary in which the keys are the unique identifiers of

715

the manifest items and the values are the items themselves.

716

:attr:`hrefs`: A dictionary in which the keys are the internal paths of the

717

manifest items and the values are the items themselves.

718

"""

486

719

487

720

class Item(object):

721

"""An OEB data model book content file.

722

723

Provides the following data members for accessing the file content and

724

metadata associated with this particular file.

725

726

:attr:`id`: Unique identifier.

727

:attr:`href`: Book-internal path.

728

:attr:`media_type`: MIME type of the file content.

729

:attr:`fallback`: Unique id of any fallback manifest item associated

730

with this manifest item.

731

:attr:`spine_position`: Display/reading order index for book textual

732

content. `None` for manifest items which are not part of the

733

book's textual content.

734

:attr:`linear`: `True` for textual content items which are part of the

735

primary linear reading order and `False` for textual content items

736

which are not (such as footnotes). Meaningless for items which

737

have a :attr:`spine_position` of `None`.

738

"""

488

739

489

740

NUM_RE = re.compile('^(.*)([0-9][0-9.]*)(?=[.]|$)')

490

741

META_XP = XPath('/h:html/h:head/h:meta[@http-equiv="Content-Type"]')

504

755

self._data = data

505

756

506

757

def __repr__(self):

507

return 'Item(id=%r, href=%r, media_type=%r)' \

758

return u'Item(id=%r, href=%r, media_type=%r)' \

508

759

% (self.id, self.href, self.media_type)

509

760

510

def _force_xhtml(self, data):

761

def _parse_xml(self, data):
    """Parse `data` as XML and return the root lxml element.

    If parsing fails with an ``XMLSyntaxError`` whose ``code`` is 26 or
    whose message starts with ``'Entity'``, `data` is passed through
    ``xml_to_unicode`` (stripping encoding declarations and resolving
    entities) and parsed again.
    NOTE(review): code 26 is presumably libxml2's undeclared-entity
    error -- confirm.

    :raises etree.XMLSyntaxError: for any other syntax error, or if the
        second parse attempt fails.
    """
    try:
        return etree.fromstring(data)
    except etree.XMLSyntaxError as err:
        if getattr(err, 'code', 0) == 26 or str(err).startswith('Entity'):
            data = xml_to_unicode(data, strip_encoding_pats=True,
                                  resolve_entities=True)[0]
            return etree.fromstring(data)
        # Not an entity problem: propagate instead of silently returning
        # None, which would later be mistaken for valid parsed content.
        raise

769

770

def _parse_xhtml(self, data):

771

self.oeb.log.debug('Parsing', self.href, '...')

511

772

# Convert to Unicode and normalize line endings

512

773

data = self.oeb.decode(data)

513

data = XMLDECL_RE.sub('', data)

514

# Handle broken XHTML w/ SVG (ugh)

515

if 'svg:' in data and SVG_NS not in data:

516

data = data.replace(

517

'<html', '<html xmlns:svg="%s"' % SVG_NS, 1)

518

if 'xlink:' in data and XLINK_NS not in data:

519

data = data.replace(

520

'<html', '<html xmlns:xlink="%s"' % XLINK_NS, 1)

774

data = self.oeb.html_preprocessor(data)

775

776

# Remove DOCTYPE declaration as it messes up parsing

777

# In particular, it causes tostring to insert xmlns

778

# declarations, which messes up the coercing logic

779

idx = data.find('<html')

780

if idx > -1:

781

pre = data[:idx]

782

data = data[idx:]

783

if '<!DOCTYPE' in pre:

784

user_entities = {}

785

for match in re.finditer(r'<!ENTITY\s+(\S+)\s+([^>]+)', pre):

786

val = match.group(2)

787

if val.startswith('"') and val.endswith('"'):

788

val = val[1:-1]

789

user_entities[match.group(1)] = val

790

if user_entities:

791

pat = re.compile(r'&(%s);'%('|'.join(user_entities.keys())))

792

data = pat.sub(lambda m:user_entities[m.group(1)], data)

793

521

794

# Try with more & more drastic measures to parse

522

try:

523

data = etree.fromstring(data)

524

except etree.XMLSyntaxError:

525

repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))

526

data = ENTITY_RE.sub(repl, data)

795

def first_pass(data):

527

796

try:

528

797

data = etree.fromstring(data)

529

798

except etree.XMLSyntaxError:

530

# TODO: Factor out HTML->XML coercion

531

self.oeb.logger.warn('Parsing file %r as HTML' % self.href)

532

data = html.fromstring(data)

533

data.attrib.pop('xmlns', None)

534

for elem in data.iter(tag=etree.Comment):

535

if elem.text:

536

elem.text = elem.text.strip('-')

537

data = etree.tostring(data, encoding=unicode)

799

repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))

800

data = ENTITY_RE.sub(repl, data)

538

801

try:

539

802

data = etree.fromstring(data)

540

803

except etree.XMLSyntaxError:

541

data = etree.fromstring(data, parser=RECOVER_PARSER)

804

self.oeb.logger.warn('Parsing file %r as HTML' % self.href)

805

data = html.fromstring(data)

806

data.attrib.pop('xmlns', None)

807

for elem in data.iter(tag=etree.Comment):

808

if elem.text:

809

elem.text = elem.text.strip('-')

810

data = etree.tostring(data, encoding=unicode)

811

try:

812

data = etree.fromstring(data)

813

except etree.XMLSyntaxError:

814

data = etree.fromstring(data, parser=RECOVER_PARSER)

815

return data

816

data = first_pass(data)

817

818

# Handle weird (non-HTML/fragment) files

819

if barename(data.tag) != 'html':

820

self.oeb.log.warn('File %r does not appear to be (X)HTML'%self.href)

821

nroot = etree.fromstring('<html></html>')

822

has_body = False

823

for child in list(data):

824

if barename(child.tag) == 'body':

825

has_body = True

826

break

827

parent = nroot

828

if not has_body:

829

self.oeb.log.warn('File %r appears to be a HTML fragment'%self.href)

830

nroot = etree.fromstring('<html><body/></html>')

831

parent = nroot[0]

832

for child in list(data):

833

child.getparent().remove(child)

834

parent.append(child)

835

data = nroot

836

542

837

# Force into the XHTML namespace

543

if barename(data.tag) != 'html':

544

raise NotHTML(

545

'File %r does not appear to be (X)HTML' % self.href)

546

elif not namespace(data.tag):

838

if not namespace(data.tag):

547

839

data.attrib['xmlns'] = XHTML_NS

548

840

data = etree.tostring(data, encoding=unicode)

549

841

try:

550

842

data = etree.fromstring(data)

551

843

except:

552

844

data=data.replace(':=', '=').replace(':>', '>')

553

data = etree.fromstring(data)

845

try:

846

data = etree.fromstring(data)

847

except etree.XMLSyntaxError:

848

self.oeb.logger.warn('Stripping comments and meta tags from %s'%

849

self.href)

850

data = re.compile(r'', re.DOTALL).sub('',

851

data)

852

data = re.sub(r'<meta\s+[^>]+?>', '', data)

853

data = etree.fromstring(data)

554

854

elif namespace(data.tag) != XHTML_NS:

555

855

# OEB_DOC_NS, but possibly others

556

856

ns = namespace(data.tag)

564

864

for elem in data:

565

865

nroot.append(elem)

566

866

data = nroot

867

868

data = merge_multiple_html_heads_and_bodies(data, self.oeb.logger)

567

869

# Ensure has a <head/>

568

870

head = xpath(data, '/h:html/h:head')

569

871

head = head[0] if head else None

590

892

self.oeb.logger.warn(

591

893

'File %r missing <body/> element' % self.href)

592

894

etree.SubElement(data, XHTML('body'))

593

return data

594

595

@apply

596

def data():

895

896

# Remove microsoft office markup

897

r = [x for x in data.iterdescendants(etree.Element) if 'microsoft-com' in x.tag]

898

for x in r:

899

x.tag = XHTML('span')

900

901

# Remove lang redefinition inserted by the amazing Microsoft Word!

902

body = xpath(data, '/h:html/h:body')[0]

903

for key in list(body.attrib.keys()):

904

if key == 'lang' or key.endswith('}lang'):

905

body.attrib.pop(key)

906

907

return data

908

909

def _parse_txt(self, data):

910

if '<html>' in data:

911

return self._parse_xhtml(data)

912

913

self.oeb.log.debug('Converting', self.href, '...')

914

915

from calibre.ebooks.txt.processor import txt_to_markdown

916

917

title = self.oeb.metadata.title

918

if title:

919

title = unicode(title[0])

920

else:

921

title = _('Unknown')

922

923

return self._parse_xhtml(txt_to_markdown(data, title))

924

925

926

def _parse_css(self, data):

927

self.oeb.log.debug('Parsing', self.href, '...')

928

data = self.oeb.decode(data)

929

data = self.oeb.css_preprocessor(data)

930

data = XHTML_CSS_NAMESPACE + data

931

parser = CSSParser(loglevel=logging.WARNING,

932

fetcher=self._fetch_css,

933

log=_css_logger)

934

data = parser.parseString(data, href=self.href)

935

data.namespaces['h'] = XHTML_NS

936

return data

937

938

def _fetch_css(self, path):
    """Resolve a CSS import against the book manifest.

    Returns ``('utf-8', css_text)`` when `path` names a manifest item whose
    media type is one of ``OEB_STYLES``; otherwise logs a warning and
    returns ``(None, None)``.
    """
    manifest_hrefs = self.oeb.manifest.hrefs
    if path in manifest_hrefs:
        target = manifest_hrefs[path]
        if target.media_type in OEB_STYLES:
            return ('utf-8', target.data.cssText)
        problem = 'CSS import of non-CSS file %r'
    else:
        problem = 'CSS import of missing file %r'
    self.oeb.logger.warn(problem % path)
    return (None, None)

949

950

@dynamic_property

951

def data(self):

952

doc = """Provides MIME type sensitive access to the manifest

953

entry's associated content.

954

955

- XHTML, HTML, and variant content is parsed as necessary to

956

convert and return as an lxml.etree element in the XHTML

957

namespace.

958

- XML content is parsed and returned as an lxml.etree element.

959

- CSS and CSS-variant content is parsed and returned as a cssutils

960

CSS DOM stylesheet.

961

- All other content is returned as a :class:`str` object with no

962

special parsing.

963

"""

597

964

def fget(self):

598

if self._data is not None:

599

return self._data

600

data = self._loader(self.href)

601

if self.media_type in OEB_DOCS:

602

data = self._force_xhtml(data)

603

elif self.media_type[-4:] in ('+xml', '/xml'):

604

data = etree.fromstring(data)

605

elif self.media_type in OEB_STYLES:

606

data = self.oeb.decode(data)

965

data = self._data

966

if data is None:

967

if self._loader is None:

968

return None

969

data = self._loader(getattr(self, 'html_input_href',

970

self.href))

971

if not isinstance(data, basestring):

972

pass # already parsed

973

elif self.media_type.lower() in OEB_DOCS:

974

data = self._parse_xhtml(data)

975

elif self.media_type.lower()[-4:] in ('+xml', '/xml'):

976

data = self._parse_xml(data)

977

elif self.media_type.lower() in OEB_STYLES:

978

data = self._parse_css(data)

979

elif 'text' in self.media_type.lower():

980

self.oeb.log.warn('%s contains data in TXT format'%self.href,

981

'converting to HTML')

982

data = self._parse_txt(data)

983

self.media_type = XHTML_MIME

607

984

self._data = data

608

985

return data

609

986

def fset(self, value):

610

987

self._data = value

611

988

def fdel(self):

612

989

self._data = None

613

return property(fget, fset, fdel)

990

return property(fget, fset, fdel, doc=doc)

614

991

615

992

def __str__(self):

616

993

data = self.data

617

994

if isinstance(data, etree._Element):

618

return xml2str(data)

995

return xml2str(data, pretty_print=self.oeb.pretty_print)

619

996

if isinstance(data, unicode):

620

997

return data.encode('utf-8')

998

if hasattr(data, 'cssText'):

999

data = data.cssText

1000

if isinstance(data, unicode):

1001

data = data.encode('utf-8')

1002

return data

621

1003

return str(data)

622

1004

1005

def __unicode__(self):

1006

data = self.data

1007

if isinstance(data, etree._Element):

1008

return xml2unicode(data, pretty_print=self.oeb.pretty_print)

1009

if isinstance(data, unicode):

1010

return data

1011

if hasattr(data, 'cssText'):

1012

return data.cssText

1013

return unicode(data)

1014

623

1015

def __eq__(self, other):

624

1016

return id(self) == id(other)

625

1017

641

1033

return cmp(skey, okey)

642

1034

643

1035

def relhref(self, href):

1036

"""Convert the URL provided in :param:`href` from a book-absolute

1037

reference to a reference relative to this manifest item.

1038

"""

644

1039

if urlparse(href).scheme:

645

1040

return href

646

1041

if '/' not in self.href:

659

1054

return relhref

660

1055

661

1056

def abshref(self, href):

662

if urlparse(href).scheme:

1057

"""Convert the URL provided in :param:`href` from a reference

1058

relative to this manifest item to a book-absolute reference.

1059

"""

1060

purl = urlparse(href)

1061

scheme = purl.scheme

1062

if scheme and scheme != 'file':

663

1063

return href

1064

purl = list(purl)

1065

purl[0] = ''

1066

href = urlunparse(purl)

664

1067

path, frag = urldefrag(href)

665

1068

if not path:

666

return '#'.join((self.href, frag))

1069

if frag:

1070

return '#'.join((self.href, frag))

1071

else:

1072

return self.href

667

1073

if '/' not in self.href:

668

1074

return href

669

1075

dirname = os.path.dirname(self.href)

673

1079

674

1080

def __init__(self, oeb):

675

1081

self.oeb = oeb

1082

self.items = set()

676

1083

self.ids = {}

677

1084

self.hrefs = {}

678

1085

679

1086

def add(self, id, href, media_type, fallback=None, loader=None, data=None):

1087

"""Add a new item to the book manifest.

1088

1089

The item's :param:`id`, :param:`href`, and :param:`media_type` are all

1090

required. A :param:`fallback` item-id is required for any items with a

1091

MIME type which is not one of the OPS core media types. Either the

1092

item's data itself may be provided with :param:`data`, or a loader

1093

function for the data may be provided with :param:`loader`, or the

1094

item's data may later be set manually via the :attr:`data` attribute.

1095

"""

680

1096

item = self.Item(

681

1097

self.oeb, id, href, media_type, fallback, loader, data)

1098

self.items.add(item)

682

1099

self.ids[item.id] = item

683

1100

self.hrefs[item.href] = item

684

1101

return item

685

1102

686

1103

def remove(self, item):

1104

"""Removes :param:`item` from the manifest."""

687

1105

if item in self.ids:

688

1106

item = self.ids[item]

689

1107

del self.ids[item.id]

690

1108

del self.hrefs[item.href]

1109

self.items.remove(item)

691

1110

if item in self.oeb.spine:

692

1111

self.oeb.spine.remove(item)

693

1112

694

1113

def generate(self, id=None, href=None):

1114

"""Generate a new unique identifier and/or internal path for use in

1115

creating a new manifest item, using the provided :param:`id` and/or

1116

:param:`href` as bases.

1117

1118

Returns a two-tuple of the new id and path. If either :param:`id` or

1119

:param:`href` are `None` then the corresponding item in the return

1120

tuple will also be `None`.

1121

"""

695

1122

if id is not None:

696

1123

base = id

697

1124

index = 1

708

1135

return id, href

709

1136

710

1137

def __iter__(self):

711

for id in self.ids:

712

yield id

1138

for item in self.items:

1139

yield item

713

1140

714

def __getitem__(self, id):

715

return self.ids[id]

1141

def __len__(self):

1142

return len(self.items)

716

1143

717

1144

def values(self):

718

for item in self.ids.values():

719

yield item

720

721

def items(self):

722

for id, item in self.ids.items():

723

yield id, item

724

725

def __contains__(self, key):

726

return key in self.ids

1145

return list(self.items)

1146

1147

def __contains__(self, item):

1148

return item in self.items

727

1149

728

1150

def to_opf1(self, parent=None):

729

1151

elem = element(parent, 'manifest')

730

for item in self.ids.values():

1152

for item in self.items:

731

1153

media_type = item.media_type

732

1154

if media_type in OEB_DOCS:

733

1155

media_type = OEB_DOC_MIME

734

1156

elif media_type in OEB_STYLES:

735

1157

media_type = OEB_CSS_MIME

736

attrib = {'id': item.id, 'href': item.href,

1158

attrib = {'id': item.id, 'href': urlunquote(item.href),

737

1159

'media-type': media_type}

738

1160

if item.fallback:

739

1161

attrib['fallback'] = item.fallback

742

1164

743

1165

def to_opf2(self, parent=None):

744

1166

elem = element(parent, OPF('manifest'))

745

for item in self.ids.values():

1167

for item in self.items:

746

1168

media_type = item.media_type

747

1169

if media_type in OEB_DOCS:

748

1170

media_type = XHTML_MIME

749

1171

elif media_type in OEB_STYLES:

750

1172

media_type = CSS_MIME

751

attrib = {'id': item.id, 'href': item.href,

1173

attrib = {'id': item.id, 'href': urlunquote(item.href),

752

1174

'media-type': media_type}

753

1175

if item.fallback:

754

1176

attrib['fallback'] = item.fallback

757

1179

758

1180

759

1181

class Spine(object):

1182

"""Collection of manifest items composing an OEB data model book's main

1183

textual content.

760

1184

1185

The spine manages which manifest items compose the book's main textual

1186

content and the sequence in which they appear. Provides Python container

1187

access as a list-like object.

1188

"""

761

1189

def __init__(self, oeb):

762

1190

self.oeb = oeb

763

1191

self.items = []

764

1192

765

1193

def _linear(self, linear):

766

if isinstance(linear, StringTypes):

1194

if isinstance(linear, basestring):

767

1195

linear = linear.lower()

768

1196

if linear is None or linear in ('yes', 'true'):

769

1197

linear = True

772

1200

return linear

773

1201

774

1202

def add(self, item, linear=None):

1203

"""Append :param:`item` to the end of the `Spine`."""

775

1204

item.linear = self._linear(linear)

776

1205

item.spine_position = len(self.items)

777

1206

self.items.append(item)

778

1207

return item

779

1208

780

1209

def insert(self, index, item, linear):

1210

"""Insert :param:`item` at position :param:`index` in the `Spine`."""

781

1211

item.linear = self._linear(linear)

782

1212

item.spine_position = index

783

1213

self.items.insert(index, item)

786

1216

return item

787

1217

788

1218

def remove(self, item):

1219

"""Remove :param:`item` from the `Spine`."""

789

1220

index = item.spine_position

790

1221

self.items.pop(index)

791

1222

for i in xrange(index, len(self.items)):

792

1223

self.items[i].spine_position = i

793

1224

item.spine_position = None

794

1225

1226

def index(self, item):
    """Return the position of the first entry equal to `item`, or -1."""
    matches = (pos for pos, entry in enumerate(self) if item == entry)
    return next(matches, -1)

1231

795

1232

def __iter__(self):

796

1233

for item in self.items:

797

1234

yield item

823

1260

824

1261

825

1262

class Guide(object):

1263

"""Collection of references to standard frequently-occurring sections

1264

within an OEB data model book.

1265

1266

Provides dictionary-like access, in which the keys are the OEB reference

1267

type identifiers and the values are `Reference` objects.

1268

"""

826

1269

827

1270

class Reference(object):

828

1271

"""Reference to a standard book section.

1272

1273

Provides the following instance data members:

1274

1275

:attr:`type`: Reference type identifier, as chosen from the list

1276

allowed in the OPF 2.0 specification.

1277

:attr:`title`: Human-readable section title.

1278

:attr:`href`: Book-internal URL of the referenced section. May include

1279

a fragment identifier.

1280

"""

829

1281

_TYPES_TITLES = [('cover', __('Cover')),

830

1282

('title-page', __('Title Page')),

831

1283

('toc', __('Table of Contents')),

845

1297

('text', __('Main Text'))]

846

1298

TYPES = set(t for t, _ in _TYPES_TITLES)

847

1299

TITLES = dict(_TYPES_TITLES)

848

ORDER = dict((t, i) for (t, _), i in izip(_TYPES_TITLES, count(0)))

1300

ORDER = dict((t, i) for i, (t, _) in enumerate(_TYPES_TITLES))

849

1301

850

1302

def __init__(self, oeb, type, title, href):

851

1303

self.oeb = oeb

864

1316

return 'Reference(type=%r, title=%r, href=%r)' \

865

1317

% (self.type, self.title, self.href)

866

1318

867

@apply

868

def _order():

1319

@dynamic_property

1320

def _order(self):

869

1321

def fget(self):

870

1322

return self.ORDER.get(self.type, self.type)

871

1323

return property(fget=fget)

875

1327

return NotImplemented

876

1328

return cmp(self._order, other._order)

877

1329

878

@apply

879

def item():

1330

@dynamic_property

1331

def item(self):

1332

doc = """The manifest item associated with this reference."""

880

1333

def fget(self):

881

1334

path = urldefrag(self.href)[0]

882

1335

hrefs = self.oeb.manifest.hrefs

883

1336

return hrefs.get(path, None)

884

return property(fget=fget)

1337

return property(fget=fget, doc=doc)

885

1338

886

1339

def __init__(self, oeb):

887

1340

self.oeb = oeb

888

1341

self.refs = {}

889

1342

890

1343

def add(self, type, title, href):

1344

"""Add a new reference to the `Guide`."""

891

1345

ref = self.Reference(self.oeb, type, title, href)

892

1346

self.refs[type] = ref

893

1347

return ref

894

1348

1349

def remove(self, type):
    """Remove and return the reference stored under `type`.

    Returns ``None`` when no reference of that type is present.
    """
    try:
        return self.refs.pop(type)
    except KeyError:
        return None

1351

895

1352

def iterkeys(self):

896

1353

for type in self.refs:

897

1354

yield type

919

1376

def to_opf1(self, parent=None):

920

1377

elem = element(parent, 'guide')

921

1378

for ref in self.refs.values():

922

attrib = {'type': ref.type, 'href': ref.href}

1379

attrib = {'type': ref.type, 'href': urlunquote(ref.href)}

923

1380

if ref.title:

924

1381

attrib['title'] = ref.title

925

1382

element(elem, 'reference', attrib=attrib)

928

1385

def to_opf2(self, parent=None):

929

1386

elem = element(parent, OPF('guide'))

930

1387

for ref in self.refs.values():

931

attrib = {'type': ref.type, 'href': ref.href}

1388

attrib = {'type': ref.type, 'href': urlunquote(ref.href)}

932

1389

if ref.title:

933

1390

attrib['title'] = ref.title

934

1391

element(elem, OPF('reference'), attrib=attrib)

935

1392

return elem

936

1393

937

1394

1395

# TODO: This needs beefing up to support the interface of toc.TOC

938

1396

class TOC(object):

939

# This needs beefing up to support the interface of toc.TOC

940

def __init__(self, title=None, href=None, klass=None, id=None):

1397

"""Represents a hierarchical table of contents or navigation tree for

1398

accessing arbitrary semantic sections within an OEB data model book.

1399

1400

Acts as a node within the navigation tree. Provides list-like access to

1401

sub-nodes. Provides the following node instance data attributes:

1402

1403

:attr:`title`: The title of this navigation node.

1404

:attr:`href`: Book-internal URL referenced by this node.

1405

:attr:`klass`: Optional semantic class referenced by this node.

1406

:attr:`id`: Optional unique identifier for this node.

1407

:attr:`author`: Optional author attribution for periodicals <mbp:>

1408

:attr:`description`: Optional description attribute for periodicals <mbp:>

1409

"""

1410

def __init__(self, title=None, href=None, klass=None, id=None,

1411

play_order=None, author=None, description=None):

941

1412

self.title = title

942

1413

self.href = urlnormalize(href) if href else href

943

1414

self.klass = klass

944

1415

self.id = id

945

1416

self.nodes = []

1417

self.play_order = 0

1418

if play_order is None:

1419

play_order = self.next_play_order()

1420

self.play_order = play_order

1421

self.author = author

1422

self.description = description

946

1423

947

def add(self, title, href, klass=None, id=None):

948

node = TOC(title, href, klass, id)

1424

def add(self, title, href, klass=None, id=None, play_order=0, author=None, description=None):

1425

"""Create and return a new sub-node of this node."""

1426

node = TOC(title, href, klass, id, play_order, author, description)

949

1427

self.nodes.append(node)

950

1428

return node

951

1429

1430

def remove(self, node):

1431

for child in self.nodes:

1432

if child is node:

1433

self.nodes.remove(child)

1434

return True

1435

else:

1436

if child.remove(node):

1437

return True

1438

return False

1439

1440

def iter(self):

1441

"""Iterate over this node and all descendants in depth-first order."""

1442

yield self

1443

for child in self.nodes:

1444

for node in child.iter():

1445

yield node

1446

1447

def count(self):

1448

return len(list(self.iter())) - 1

1449

1450

def next_play_order(self):

1451

entries = [x.play_order for x in self.iter()]

1452

base = max(entries) if entries else 0

1453

return base+1

1454

1455

def has_href(self, href):

1456

for x in self.iter():

1457

if x.href == href:

1458

return True

1459

return False

1460

1461

def has_text(self, text):

1462

for x in self.iter():

1463

if x.title and x.title.lower() == text.lower():

1464

return True

1465

return False

1466

952

1467

def iterdescendants(self):

953

for node in self.nodes:

954

yield node

955

for child in node.iterdescendants():

956

yield child

1468

"""Iterate over all descendant nodes in depth-first order."""

1469

for child in self.nodes:

1470

for node in child.iter():

1471

yield node

957

1472

958

1473

def __iter__(self):

1474

"""Iterate over all immediate child nodes."""

959

1475

for node in self.nodes:

960

1476

yield node

961

1477

963

1479

return self.nodes[index]

964

1480

965

1481

def autolayer(self):

1482

"""Make sequences of children pointing to the same content file into

1483

children of the first node referencing that file.

1484

"""

966

1485

prev = None

967

1486

for node in list(self.nodes):

968

1487

if prev and urldefrag(prev.href)[0] == urldefrag(node.href)[0]:

971

1490

else:

972

1491

prev = node

973

1492

974

def depth(self, level=0):

975

if self.nodes:

976

return self.nodes[0].depth(level+1)

977

return level

1493

def depth(self):

1494

"""The maximum depth of the navigation tree rooted at this node."""

1495

try:

1496

return max(node.depth() for node in self.nodes) + 1

1497

except ValueError:

1498

return 1

1499

1500

def __str__(self):

1501

return 'TOC: %s --> %s'%(self.title, self.href)

1502

978

1503

979

1504

def to_opf1(self, tour):

980

1505

for node in self.nodes:

981

1506

element(tour, 'site', attrib={

982

'title': node.title, 'href': node.href})

1507

'title': node.title, 'href': urlunquote(node.href)})

983

1508

node.to_opf1(tour)

984

1509

return tour

985

1510

986

def to_ncx(self, parent, depth=1):

1511

def to_ncx(self, parent=None):

1512

if parent is None:

1513

parent = etree.Element(NCX('navMap'))

987

1514

for node in self.nodes:

988

1515

id = node.id or unicode(uuid.uuid4())

989

attrib = {'id': id, 'playOrder': '0'}

1516

attrib = {'id': id, 'playOrder': str(node.play_order)}

990

1517

if node.klass:

991

1518

attrib['class'] = node.klass

992

1519

point = element(parent, NCX('navPoint'), attrib=attrib)

993

1520

label = etree.SubElement(point, NCX('navLabel'))

994

1521

element(label, NCX('text')).text = node.title

995

href = node.href if depth > 1 else urldefrag(node.href)[0]

996

element(point, NCX('content'), src=href)

997

node.to_ncx(point, depth+1)

1522

element(point, NCX('content'), src=urlunquote(node.href))

1523

node.to_ncx(point)

998

1524

return parent

999

1525

1526

def rationalize_play_orders(self):

1527

'''

1528

Ensure that all nodes with the same play_order have the same href and

1529

nodes with different play_orders have different hrefs.

1530

'''

1531

def po_node(n):

1532

for x in self.iter():

1533

if x is n:

1534

return

1535

if x.play_order == n.play_order:

1536

return x

1537

1538

def href_node(n):

1539

for x in self.iter():

1540

if x is n:

1541

return

1542

if x.href == n.href:

1543

return x

1544

1545

for x in self.iter():

1546

y = po_node(x)

1547

if y is not None:

1548

if x.href != y.href:

1549

x.play_order = getattr(href_node(x), 'play_order',

1550

self.next_play_order())

1551

y = href_node(x)

1552

if y is not None:

1553

x.play_order = y.play_order

1000

1554

1001

1555

class PageList(object):

1556

"""Collection of named "pages" mapped to positions within an OEB data model

1557

book's textual content.

1558

1559

Provides list-like access to the pages.

1560

"""

1002

1561

1003

1562

class Page(object):

1563

"""Represents a mapping between a page name and a position within

1564

the book content.

1565

1566

Provides the following instance data attributes:

1567

1568

:attr:`name`: The name of this page. Generally a number.

1569

:attr:`href`: Book-internal URL at which point this page begins.

1570

:attr:`type`: Must be one of 'front' (for prefatory pages, as commonly

1571

labeled in print with lower-case Roman numerals), 'normal' (for

1572

standard pages, as commonly labeled in print with Arabic numerals),

1573

or 'special' (for other pages, as commonly not labeled in any

1574

fashion in print, such as the cover and title pages).

1575

:attr:`klass`: Optional semantic class of this page.

1576

:attr:`id`: Optional unique identifier for this page.

1577

"""

1578

TYPES = set(['front', 'normal', 'special'])

1579

1004

1580

def __init__(self, name, href, type='normal', klass=None, id=None):

1005

self.name = name

1581

self.name = unicode(name)

1006

1582

self.href = urlnormalize(href)

1007

self.type = type

1583

self.type = type if type in self.TYPES else 'normal'

1008

1584

self.id = id

1009

1585

self.klass = klass

1010

1586

1012

1588

self.pages = []

1013

1589

1014

1590

def add(self, name, href, type='normal', klass=None, id=None):

1591

"""Create a new page and add it to the `PageList`."""

1015

1592

page = self.Page(name, href, type, klass, id)

1016

1593

self.pages.append(page)

1017

1594

return page

1026

1603

def __getitem__(self, index):

1027

1604

return self.pages[index]

1028

1605

1606

def pop(self, index=-1):

1607

return self.pages.pop(index)

1608

1609

def remove(self, page):

1610

return self.pages.remove(page)

1611

1029

1612

def to_ncx(self, parent=None):

1030

1613

plist = element(parent, NCX('pageList'), id=str(uuid.uuid4()))

1031

1614

values = dict((t, count(1)) for t in ('front', 'normal', 'special'))

1050

1633

1051

1634

1052

1635

class OEBBook(object):

1636

"""Representation of a book in the IDPF OEB data model."""

1053

1637

1054

1638

COVER_SVG_XP = XPath('h:body//svg:svg[position() = 1]')

1055

1639

COVER_OBJECT_XP = XPath('h:body//h:object[@data][position() = 1]')

1056

1640

1057

def __init__(self, opfpath=None, container=None, encoding=None,

1058

logger=FauxLogger()):

1059

if opfpath and not container:

1060

container = DirContainer(os.path.dirname(opfpath))

1061

opfpath = os.path.basename(opfpath)

1062

self.container = container

1641

def __init__(self, logger,

1642

html_preprocessor,

1643

css_preprocessor=CSSPreProcessor(),

1644

encoding='utf-8', pretty_print=False,

1645

input_encoding='utf-8'):

1646

"""Create empty book. Arguments:

1647

1648

:param:`encoding`: Default encoding for textual content read

1649

from an external container.

1650

:param:`pretty_print`: Whether or not the canonical string form

1651

of XML markup is pretty-printed.

1652

:param html_preprocessor: A callable that takes a unicode object

1653

and returns a unicode object. Will be called on all html files

1654

before they are parsed.

1655

:param css_preprocessor: A callable that takes a unicode object

1656

and returns a unicode object. Will be called on all CSS files

1657

before they are parsed.

1658

:param:`logger`: A Log object to use for logging all messages

1659

related to the processing of this book. It is accessible

1660

via the instance data members :attr:`logger` and :attr:`log`.

1661

1662

It provides the following public instance data members for

1663

accessing various parts of the OEB data model:

1664

1665

:attr:`metadata`: Metadata such as title, author name(s), etc.

1666

:attr:`manifest`: Manifest of all files included in the book,

1667

including MIME types and fallback information.

1668

:attr:`spine`: In-order list of manifest items which compose

1669

the textual content of the book.

1670

:attr:`guide`: Collection of references to standard positions

1671

within the text, such as the cover, preface, etc.

1672

:attr:`toc`: Hierarchical table of contents.

1673

:attr:`pages`: List of "pages," such as those indexed to a print edition of

1674

the same text.

1675

"""

1676

_css_log_handler.log = logger

1063

1677

self.encoding = encoding

1064

self.logger = logger

1065

if opfpath or container:

1066

opf = self._read_opf(opfpath)

1067

self._all_from_opf(opf)

1068

1069

def _clean_opf(self, opf):

1070

nsmap = {}

1071

for elem in opf.iter(tag=etree.Element):

1072

nsmap.update(elem.nsmap)

1073

for elem in opf.iter(tag=etree.Element):

1074

if namespace(elem.tag) in ('', OPF1_NS):

1075

elem.tag = OPF(barename(elem.tag))

1076

nsmap.update(Metadata.OPF2_NSMAP)

1077

attrib = dict(opf.attrib)

1078

nroot = etree.Element(OPF('package'),

1079

nsmap={None: OPF2_NS}, attrib=attrib)

1080

metadata = etree.SubElement(nroot, OPF('metadata'), nsmap=nsmap)

1081

ignored = (OPF('dc-metadata'), OPF('x-metadata'))

1082

for elem in xpath(opf, 'o2:metadata//*'):

1083

if elem.tag in ignored:

1084

continue

1085

if namespace(elem.tag) in DC_NSES:

1086

tag = barename(elem.tag).lower()

1087

elem.tag = '{%s}%s' % (DC11_NS, tag)

1088

metadata.append(elem)

1089

for element in xpath(opf, 'o2:metadata//o2:meta'):

1090

metadata.append(element)

1091

for tag in ('o2:manifest', 'o2:spine', 'o2:tours', 'o2:guide'):

1092

for element in xpath(opf, tag):

1093

nroot.append(element)

1094

return nroot

1095

1096

def _read_opf(self, opfpath):

1097

data = self.container.read(opfpath)

1098

data = self.decode(data)

1099

data = XMLDECL_RE.sub('', data)

1100

try:

1101

opf = etree.fromstring(data)

1102

except etree.XMLSyntaxError:

1103

repl = lambda m: ENTITYDEFS.get(m.group(1), m.group(0))

1104

data = ENTITY_RE.sub(repl, data)

1105

opf = etree.fromstring(data)

1106

self.logger.warn('OPF contains invalid HTML named entities')

1107

ns = namespace(opf.tag)

1108

if ns not in ('', OPF1_NS, OPF2_NS):

1109

raise OEBError('Invalid namespace %r for OPF document' % ns)

1110

opf = self._clean_opf(opf)

1111

return opf

1112

1113

def _metadata_from_opf(self, opf):

1114

uid = opf.get('unique-identifier', None)

1678

self.input_encoding = input_encoding

1679

self.html_preprocessor = html_preprocessor

1680

self.css_preprocessor = css_preprocessor

1681

self.pretty_print = pretty_print

1682

self.logger = self.log = logger

1683

self.version = '2.0'

1684

self.container = NullContainer(self.log)

1685

self.metadata = Metadata(self)

1115

1686

self.uid = None

1116

self.metadata = metadata = Metadata(self)

1117

for elem in xpath(opf, '/o2:package/o2:metadata//*'):

1118

term = elem.tag

1119

value = elem.text

1120

attrib = dict(elem.attrib)

1121

nsmap = elem.nsmap

1122

if term == OPF('meta'):

1123

term = qname(attrib.pop('name', None), nsmap)

1124

value = attrib.pop('content', None)

1125

if value:

1126

value = COLLAPSE_RE.sub(' ', value.strip())

1127

if term and (value or attrib):

1128

metadata.add(term, value, attrib, nsmap=nsmap)

1129

haveuuid = haveid = False

1130

for ident in metadata.identifier:

1131

if unicode(ident).startswith('urn:uuid:'):

1132

haveuuid = True

1133

if 'id' in ident.attrib:

1134

haveid = True

1135

if not (haveuuid and haveid):

1136

bookid = "urn:uuid:%s" % str(uuid.uuid4())

1137

metadata.add('identifier', bookid, id='calibre-uuid')

1138

if uid is None:

1139

self.logger.warn(u'Unique-identifier not specified')

1140

for item in metadata.identifier:

1141

if not item.id:

1142

continue

1143

if uid is None or item.id == uid:

1144

self.uid = item

1145

break

1146

else:

1147

self.logger.warn(u'Unique-identifier %r not found' % uid)

1148

for ident in metadata.identifier:

1149

if 'id' in ident.attrib:

1150

self.uid = metadata.identifier[0]

1151

break

1152

if not metadata.language:

1153

self.logger.warn(u'Language not specified')

1154

metadata.add('language', get_lang())

1155

if not metadata.creator:

1156

self.logger.warn('Creator not specified')

1157

metadata.add('creator', self.translate(__('Unknown')))

1158

if not metadata.title:

1159

self.logger.warn('Title not specified')

1160

metadata.add('title', self.translate(__('Unknown')))

1161

1162

def _manifest_add_missing(self):

1163

manifest = self.manifest

1164

known = set(manifest.hrefs)

1165

unchecked = set(manifest.values())

1166

while unchecked:

1167

new = set()

1168

for item in unchecked:

1169

if (item.media_type in OEB_DOCS or

1170

item.media_type[-4:] in ('/xml', '+xml')) and \

1171

item.data is not None:

1172

hrefs = [sel(item.data) for sel in LINK_SELECTORS]

1173

for href in chain(*hrefs):

1174

href, _ = urldefrag(href)

1175

if not href:

1176

continue

1177

href = item.abshref(urlnormalize(href))

1178

scheme = urlparse(href).scheme

1179

if not scheme and href not in known:

1180

new.add(href)

1181

elif item.media_type in OEB_STYLES:

1182

for match in CSSURL_RE.finditer(item.data):

1183

href, _ = urldefrag(match.group('url'))

1184

href = item.abshref(urlnormalize(href))

1185

scheme = urlparse(href).scheme

1186

if not scheme and href not in known:

1187

new.add(href)

1188

unchecked.clear()

1189

for href in new:

1190

known.add(href)

1191

if not self.container.exists(href):

1192

self.logger.warn('Referenced file %r not found' % href)

1193

continue

1194

self.logger.warn('Referenced file %r not in manifest' % href)

1195

id, _ = manifest.generate(id='added')

1196

guessed = guess_type(href)[0]

1197

media_type = guessed or BINARY_MIME

1198

added = manifest.add(id, href, media_type)

1199

unchecked.add(added)

1200

1201

def _manifest_from_opf(self, opf):

1202

self.manifest = manifest = Manifest(self)

1203

for elem in xpath(opf, '/o2:package/o2:manifest/o2:item'):

1204

id = elem.get('id')

1205

href = elem.get('href')

1206

media_type = elem.get('media-type', None)

1207

if media_type is None:

1208

media_type = elem.get('mediatype', None)

1209

if media_type is None or media_type == 'text/xml':

1210

guessed = guess_type(href)[0]

1211

media_type = guessed or media_type or BINARY_MIME

1212

fallback = elem.get('fallback')

1213

if href in manifest.hrefs:

1214

self.logger.warn(u'Duplicate manifest entry for %r' % href)

1215

continue

1216

if not self.container.exists(href):

1217

self.logger.warn(u'Manifest item %r not found' % href)

1218

continue

1219

if id in manifest.ids:

1220

self.logger.warn(u'Duplicate manifest id %r' % id)

1221

id, href = manifest.generate(id, href)

1222

manifest.add(id, href, media_type, fallback)

1223

self._manifest_add_missing()

1224

1225

def _spine_add_extra(self):

1226

manifest = self.manifest

1227

spine = self.spine

1228

unchecked = set(spine)

1229

selector = XPath('h:body//h:a/@href')

1230

extras = set()

1231

while unchecked:

1232

new = set()

1233

for item in unchecked:

1234

if item.media_type not in OEB_DOCS:

1235

# TODO: handle fallback chains

1236

continue

1237

for href in selector(item.data):

1238

href, _ = urldefrag(href)

1239

if not href:

1240

continue

1241

href = item.abshref(urlnormalize(href))

1242

if href not in manifest.hrefs:

1243

continue

1244

found = manifest.hrefs[href]

1245

if found.media_type not in OEB_DOCS or \

1246

found in spine or found in extras:

1247

continue

1248

new.add(found)

1249

extras.update(new)

1250

unchecked = new

1251

version = int(self.version[0])

1252

for item in sorted(extras):

1253

if version >= 2:

1254

self.logger.warn(

1255

'Spine-referenced file %r not in spine' % item.href)

1256

spine.add(item, linear=False)

1257

1258

def _spine_from_opf(self, opf):

1259

self.spine = spine = Spine(self)

1260

for elem in xpath(opf, '/o2:package/o2:spine/o2:itemref'):

1261

idref = elem.get('idref')

1262

if idref not in self.manifest:

1263

self.logger.warn(u'Spine item %r not found' % idref)

1264

continue

1265

item = self.manifest[idref]

1266

spine.add(item, elem.get('linear'))

1267

if len(spine) == 0:

1268

raise OEBError("Spine is empty")

1269

self._spine_add_extra()

1270

1271

def _guide_from_opf(self, opf):

1272

self.guide = guide = Guide(self)

1273

for elem in xpath(opf, '/o2:package/o2:guide/o2:reference'):

1274

href = elem.get('href')

1275

path = urldefrag(href)[0]

1276

if path not in self.manifest.hrefs:

1277

self.logger.warn(u'Guide reference %r not found' % href)

1278

continue

1279

guide.add(elem.get('type'), elem.get('title'), href)

1280

1281

def _find_ncx(self, opf):

1282

result = xpath(opf, '/o2:package/o2:spine/@toc')

1283

if result:

1284

id = result[0]

1285

if id not in self.manifest.ids:

1286

return None

1287

item = self.manifest.ids[id]

1288

self.manifest.remove(item)

1289

return item

1290

for item in self.manifest.values():

1291

if item.media_type == NCX_MIME:

1292

self.manifest.remove(item)

1293

return item

1294

return None

1295

1296

def _toc_from_navpoint(self, item, toc, navpoint):

1297

children = xpath(navpoint, 'ncx:navPoint')

1298

for child in children:

1299

title = ''.join(xpath(child, 'ncx:navLabel/ncx:text/text()'))

1300

title = COLLAPSE_RE.sub(' ', title.strip())

1301

href = xpath(child, 'ncx:content/@src')

1302

if not title or not href:

1303

continue

1304

href = item.abshref(urlnormalize(href[0]))

1305

path, _ = urldefrag(href)

1306

if path not in self.manifest.hrefs:

1307

self.logger.warn('TOC reference %r not found' % href)

1308

continue

1309

id = child.get('id')

1310

klass = child.get('class')

1311

node = toc.add(title, href, id=id, klass=klass)

1312

self._toc_from_navpoint(item, node, child)

1313

1314

def _toc_from_ncx(self, item):

1315

if item is None:

1316

return False

1317

ncx = item.data

1318

title = ''.join(xpath(ncx, 'ncx:docTitle/ncx:text/text()'))

1319

title = COLLAPSE_RE.sub(' ', title.strip())

1320

title = title or unicode(self.metadata.title[0])

1321

self.toc = toc = TOC(title)

1322

navmaps = xpath(ncx, 'ncx:navMap')

1323

for navmap in navmaps:

1324

self._toc_from_navpoint(item, toc, navmap)

1325

return True

1326

1327

def _toc_from_tour(self, opf):

1328

result = xpath(opf, 'o2:tours/o2:tour')

1329

if not result:

1330

return False

1331

tour = result[0]

1332

self.toc = toc = TOC(tour.get('title'))

1333

sites = xpath(tour, 'o2:site')

1334

for site in sites:

1335

title = site.get('title')

1336

href = site.get('href')

1337

if not title or not href:

1338

continue

1339

path, _ = urldefrag(urlnormalize(href))

1340

if path not in self.manifest.hrefs:

1341

self.logger.warn('TOC reference %r not found' % href)

1342

continue

1343

id = site.get('id')

1344

toc.add(title, href, id=id)

1345

return True

1346

1347

def _toc_from_html(self, opf):

1348

if 'toc' not in self.guide:

1349

return False

1350

self.toc = toc = TOC()

1351

itempath, frag = urldefrag(self.guide['toc'].href)

1352

item = self.manifest.hrefs[itempath]

1353

html = item.data

1354

if frag:

1355

elems = xpath(html, './/*[@id="%s"]' % frag)

1356

if not elems:

1357

elems = xpath(html, './/*[@name="%s"]' % frag)

1358

elem = elems[0] if elems else html

1359

while elem != html and not xpath(elem, './/h:a[@href]'):

1360

elem = elem.getparent()

1361

html = elem

1362

titles = defaultdict(list)

1363

order = []

1364

for anchor in xpath(html, './/h:a[@href]'):

1365

href = anchor.attrib['href']

1366

href = item.abshref(urlnormalize(href))

1367

path, frag = urldefrag(href)

1368

if path not in self.manifest.hrefs:

1369

continue

1370

title = ' '.join(xpath(anchor, './/text()'))

1371

title = COLLAPSE_RE.sub(' ', title.strip())

1372

if href not in titles:

1373

order.append(href)

1374

titles[href].append(title)

1375

for href in order:

1376

toc.add(' '.join(titles[href]), href)

1377

return True

1378

1379

def _toc_from_spine(self, opf):

1380

self.toc = toc = TOC()

1381

titles = []

1382

headers = []

1383

for item in self.spine:

1384

if not item.linear: continue

1385

html = item.data

1386

title = ''.join(xpath(html, '/h:html/h:head/h:title/text()'))

1387

title = COLLAPSE_RE.sub(' ', title.strip())

1388

if title:

1389

titles.append(title)

1390

headers.append('(unlabled)')

1391

for tag in ('h1', 'h2', 'h3', 'h4', 'h5', 'strong'):

1392

expr = '/h:html/h:body//h:%s[position()=1]/text()'

1393

header = ''.join(xpath(html, expr % tag))

1394

header = COLLAPSE_RE.sub(' ', header.strip())

1395

if header:

1396

headers[-1] = header

1397

break

1398

use = titles

1399

if len(titles) > len(set(titles)):

1400

use = headers

1401

for title, item in izip(use, self.spine):

1402

if not item.linear: continue

1403

toc.add(title, item.href)

1404

return True

1405

1406

def _toc_from_opf(self, opf, item):

1407

if self._toc_from_ncx(item): return

1408

if self._toc_from_tour(opf): return

1409

self.logger.warn('No metadata table of contents found')

1410

if self._toc_from_html(opf): return

1411

self._toc_from_spine(opf)

1412

1413

def _pages_from_ncx(self, opf, item):

1414

if item is None:

1415

return False

1416

ncx = item.data

1417

ptargets = xpath(ncx, 'ncx:pageList/ncx:pageTarget')

1418

if not ptargets:

1419

return False

1420

pages = self.pages = PageList()

1421

for ptarget in ptargets:

1422

name = ''.join(xpath(ptarget, 'ncx:navLabel/ncx:text/text()'))

1423

name = COLLAPSE_RE.sub(' ', name.strip())

1424

href = xpath(ptarget, 'ncx:content/@src')

1425

if not href:

1426

continue

1427

href = item.abshref(urlnormalize(href[0]))

1428

id = ptarget.get('id')

1429

type = ptarget.get('type', 'normal')

1430

klass = ptarget.get('class')

1431

pages.add(name, href, type=type, id=id, klass=klass)

1432

return True

1433

1434

def _find_page_map(self, opf):

1435

result = xpath(opf, '/o2:package/o2:spine/@page-map')

1436

if result:

1437

id = result[0]

1438

if id not in self.manifest.ids:

1439

return None

1440

item = self.manifest.ids[id]

1441

self.manifest.remove(item)

1442

return item

1443

for item in self.manifest.values():

1444

if item.media_type == PAGE_MAP_MIME:

1445

self.manifest.remove(item)

1446

return item

1447

return None

1448

1449

def _pages_from_page_map(self, opf):

1450

item = self._find_page_map(opf)

1451

if item is None:

1452

return False

1453

pmap = item.data

1454

pages = self.pages = PageList()

1455

for page in xpath(pmap, 'o2:page'):

1456

name = page.get('name', '')

1457

href = page.get('href')

1458

if not href:

1459

continue

1460

name = COLLAPSE_RE.sub(' ', name.strip())

1461

href = item.abshref(urlnormalize(href))

1462

type = 'normal'

1463

if not name:

1464

type = 'special'

1465

elif name.lower().strip('ivxlcdm') == '':

1466

type = 'front'

1467

pages.add(name, href, type=type)

1468

return True

1469

1470

def _pages_from_opf(self, opf, item):

1471

if self._pages_from_ncx(opf, item): return

1472

if self._pages_from_page_map(opf): return

1687

self.manifest = Manifest(self)

1688

self.spine = Spine(self)

1689

self.guide = Guide(self)

1690

self.toc = TOC()

1473

1691

self.pages = PageList()

1474

return

1475

1476

def _cover_from_html(self, hcover):

1477

with TemporaryDirectory('_html_cover') as tdir:

1478

writer = DirWriter()

1479

writer.dump(self, tdir)

1480

path = os.path.join(tdir, urlunquote(hcover.href))

1481

renderer = CoverRenderer(path)

1482

data = renderer.image_data

1483

id, href = self.manifest.generate('cover', 'cover.jpeg')

1484

item = self.manifest.add(id, href, JPEG_MIME, data=data)

1485

return item

1486

1487

def _locate_cover_image(self):

1488

if self.metadata.cover:

1489

id = str(self.metadata.cover[0])

1490

item = self.manifest.ids.get(id, None)

1491

if item is not None and item.media_type in OEB_IMAGES:

1492

return item

1493

else:

1494

self.logger.warn('Invalid cover image @id %r' % id)

1495

hcover = self.spine[0]

1496

if 'cover' in self.guide:

1497

href = self.guide['cover'].href

1498

item = self.manifest.hrefs[href]

1499

media_type = item.media_type

1500

if media_type in OEB_IMAGES:

1501

return item

1502

elif media_type in OEB_DOCS:

1503

hcover = item

1504

html = hcover.data

1505

if MS_COVER_TYPE in self.guide:

1506

href = self.guide[MS_COVER_TYPE].href

1507

item = self.manifest.hrefs.get(href, None)

1508

if item is not None and item.media_type in OEB_IMAGES:

1509

return item

1510

if self.COVER_SVG_XP(html):

1511

svg = copy.deepcopy(self.COVER_SVG_XP(html)[0])

1512

href = os.path.splitext(hcover.href)[0] + '.svg'

1513

id, href = self.manifest.generate(hcover.id, href)

1514

item = self.manifest.add(id, href, SVG_MIME, data=svg)

1515

return item

1516

if self.COVER_OBJECT_XP(html):

1517

object = self.COVER_OBJECT_XP(html)[0]

1518

href = hcover.abshref(object.get('data'))

1519

item = self.manifest.hrefs.get(href, None)

1520

if item is not None and item.media_type in OEB_IMAGES:

1521

return item

1522

return self._cover_from_html(hcover)

1523

1524

def _ensure_cover_image(self):

1525

cover = self._locate_cover_image()

1526

if self.metadata.cover:

1527

self.metadata.cover[0].value = cover.id

1528

return

1529

self.metadata.add('cover', cover.id)

1530

1531

def _all_from_opf(self, opf):

1532

self.version = opf.get('version', '1.2')

1533

self._metadata_from_opf(opf)

1534

self._manifest_from_opf(opf)

1535

self._spine_from_opf(opf)

1536

self._guide_from_opf(opf)

1537

item = self._find_ncx(opf)

1538

self._toc_from_opf(opf, item)

1539

self._pages_from_opf(opf, item)

1540

self._ensure_cover_image()

1692

self.auto_generated_toc = True

1693

1694

@classmethod

1695

def generate(cls, opts):

1696

"""Generate an OEBBook instance from command-line options."""

1697

encoding = opts.encoding

1698

pretty_print = opts.pretty_print

1699

return cls(encoding=encoding, pretty_print=pretty_print)

1541

1700

1542

1701

def translate(self, text):

1702

"""Translate :param:`text` into the book's primary language."""

1543

1703

lang = str(self.metadata.language[0])

1544

1704

lang = lang.split('-', 1)[0].lower()

1545

1705

return translate(lang, text)

1546

1706

1547

1707

def decode(self, data):

1708

"""Automatically decode :param:`data` into a `unicode` object."""

1709

def fix_data(d):

1710

return d.replace('\r\n', '\n').replace('\r', '\n')

1548

1711

if isinstance(data, unicode):

1549

return data

1712

return fix_data(data)

1550

1713

if data[:2] in ('\xff\xfe', '\xfe\xff'):

1551

1714

try:

1552

return data.decode('utf-16')

1715

return fix_data(data.decode('utf-16'))

1716

except UnicodeDecodeError:

1717

pass

1718

if self.input_encoding is not None:

1719

try:

1720

return fix_data(data.decode(self.input_encoding, 'replace'))

1553

1721

except UnicodeDecodeError:

1554

1722

pass

1555

1723

try:

1556

return data.decode('utf-8')

1724

return fix_data(data.decode('utf-8'))

1557

1725

except UnicodeDecodeError:

1558

1726

pass

1559

if self.encoding is not None:

1560

try:

1561

return data.decode(self.encoding)

1562

except UnicodeDecodeError:

1563

pass

1564

1727

data, _ = xml_to_unicode(data)

1565

data = data.replace('\r\n', '\n')

1566

data = data.replace('\r', '\n')

1567

return data

1728

return fix_data(data)

1568

1729

1569

1730

def to_opf1(self):

1731

"""Produce OPF 1.2 representing the book's metadata and structure.

1732

1733

Returns a dictionary in which the keys are MIME types and the values

1734

are tuples of (default) filenames and lxml.etree element structures.

1735

"""

1570

1736

package = etree.Element('package',

1571

1737

attrib={'unique-identifier': self.uid.id})

1572

1738

self.metadata.to_opf1(package)

1638

1804

return ncx

1639

1805

1640

1806

def to_opf2(self, page_map=False):

1807

"""Produce OPF 2.0 representing the book's metadata and structure.

1808

1809

Returns a dictionary in which the keys are MIME types and the values

1810

are tuples of (default) filenames and lxml.etree element structures.

1811

"""

1641

1812

results = {}

1642

1813

package = etree.Element(OPF('package'),

1643

1814

attrib={'version': '2.0', 'unique-identifier': self.uid.id},

1659

1830

spine.attrib['page-map'] = id

1660

1831

results[PAGE_MAP_MIME] = (href, self.pages.to_page_map())

1661

1832

return results

1662

1663

1664

def main(argv=sys.argv):

1665

for arg in argv[1:]:

1666

oeb = OEBBook(arg)

1667

for name, doc in oeb.to_opf1().values():

1668

print etree.tostring(doc, pretty_print=True)

1669

for name, doc in oeb.to_opf2(page_map=True).values():

1670

print etree.tostring(doc, pretty_print=True)

1671

return 0

1672

1673

if __name__ == '__main__':

1674

sys.exit(main())

Older »