From ea09d020fa87c4b025c5fc0cacb1f82d6bd64bd7 Mon Sep 17 00:00:00 2001 From: Kunal Mehta Date: Fri, 17 Jul 2020 08:47:57 +0100 Subject: [PATCH] Import zimlib_6.1.8.orig.tar.gz [dgit import orig zimlib_6.1.8.orig.tar.gz] --- .codecov.yml | 17 + .github/FUNDING.yml | 12 + .github/move.yml | 27 + .github/workflows/ci.yml | 156 ++ .github/workflows/package.yml | 82 ++ .gitignore | 34 + AUTHORS | 1 + COPYING | 280 ++++ ChangeLog | 294 ++++ README.md | 147 ++ debian/changelog | 5 + debian/control | 69 + debian/copyright | 1 + debian/libzim-dev.install | 3 + debian/libzim6.install | 1 + debian/rules | 7 + debian/source/format | 1 + examples/createZimExample.cpp | 109 ++ examples/meson.build | 6 + include/meson.build | 23 + include/zim/article.h | 102 ++ include/zim/blob.h | 64 + include/zim/error.h | 38 + include/zim/file.h | 102 ++ include/zim/fileheader.h | 123 ++ include/zim/fileiterator.h | 118 ++ include/zim/search.h | 82 ++ include/zim/search_iterator.h | 69 + include/zim/uuid.h | 57 + include/zim/writer/article.h | 58 + include/zim/writer/creator.h | 71 + include/zim/writer/url.h | 64 + include/zim/zim.h | 66 + meson.build | 83 ++ meson_options.txt | 14 + scripts/libzim-compile-resources | 201 +++ scripts/meson.build | 2 + src/_dirent.h | 147 ++ src/article.cpp | 288 ++++ src/blob.cpp | 49 + src/buffer.cpp | 76 + src/buffer.h | 125 ++ src/cache.h | 345 +++++ src/cluster.cpp | 143 ++ src/cluster.h | 71 + src/compression.cpp | 224 +++ src/compression.h | 285 ++++ src/config.h.in | 20 + src/debug.h | 60 + src/dirent.cpp | 126 ++ src/endian_tools.h | 88 ++ src/envvalue.cpp | 58 + src/envvalue.h | 29 + src/file.cpp | 312 ++++ src/file_compound.cpp | 111 ++ src/file_compound.h | 72 + src/file_part.h | 60 + src/file_reader.cpp | 273 ++++ src/file_reader.h | 114 ++ src/fileheader.cpp | 131 ++ src/fileimpl.cpp | 617 ++++++++ src/fileimpl.h | 122 ++ src/fs.h | 38 + src/fs_unix.cpp | 137 ++ src/fs_unix.h | 90 ++ src/fs_windows.cpp | 199 +++ src/fs_windows.h | 76 + 
src/levenshtein.cpp | 31 + src/levenshtein.h | 9 + src/log.h | 37 + src/md5.c | 340 +++++ src/md5.h | 107 ++ src/meson.build | 74 + src/search.cpp | 443 ++++++ src/search_internal.h | 97 ++ src/search_iterator.cpp | 239 +++ src/template.cpp | 142 ++ src/template.h | 82 ++ src/tools.cpp | 77 + src/tools.h | 32 + src/uuid.cpp | 102 ++ src/writer/_dirent.h | 179 +++ src/writer/article.cpp | 43 + src/writer/cluster.cpp | 303 ++++ src/writer/cluster.h | 111 ++ src/writer/creator.cpp | 641 ++++++++ src/writer/creatordata.h | 149 ++ src/writer/dirent.cpp | 77 + src/writer/direntPool.h | 60 + src/writer/queue.h | 111 ++ src/writer/workers.cpp | 192 +++ src/writer/workers.h | 83 ++ src/writer/xapianIndexer.cpp | 164 +++ src/writer/xapianIndexer.h | 110 ++ src/xapian/htmlparse.cc | 377 +++++ src/xapian/htmlparse.h | 53 + src/xapian/myhtmlparse.cc | 322 ++++ src/xapian/myhtmlparse.h | 75 + src/xapian/namedentities.h | 279 ++++ src/zim_types.h | 105 ++ static/meson.build | 12 + static/resources_list.txt | 57 + static/stopwords/af | 51 + static/stopwords/ar | 480 ++++++ static/stopwords/bg | 518 +++++++ static/stopwords/bn | 398 +++++ static/stopwords/br | 126 ++ static/stopwords/ca | 278 ++++ static/stopwords/cs | 550 +++++++ static/stopwords/da | 170 +++ static/stopwords/de | 621 ++++++++ static/stopwords/el | 265 ++++ static/stopwords/en | 1298 +++++++++++++++++ static/stopwords/eo | 173 +++ static/stopwords/es | 732 ++++++++++ static/stopwords/et | 35 + static/stopwords/eu | 98 ++ static/stopwords/fa | 799 ++++++++++ static/stopwords/fi | 847 +++++++++++ static/stopwords/fr | 689 +++++++++ static/stopwords/ga | 109 ++ static/stopwords/gl | 160 ++ static/stopwords/ha | 39 + static/stopwords/he | 194 +++ static/stopwords/hi | 225 +++ static/stopwords/hr | 179 +++ static/stopwords/hu | 1185 +++++++++++++++ static/stopwords/hy | 45 + static/stopwords/id | 758 ++++++++++ static/stopwords/it | 660 +++++++++ static/stopwords/ja | 134 ++ static/stopwords/ko | 679 +++++++++ 
static/stopwords/ku | 62 + static/stopwords/la | 49 + static/stopwords/lt | 474 ++++++ static/stopwords/lv | 161 ++ static/stopwords/mr | 99 ++ static/stopwords/ms | 475 ++++++ static/stopwords/nl | 413 ++++++ static/stopwords/no | 221 +++ static/stopwords/pl | 328 +++++ static/stopwords/pt | 560 +++++++ static/stopwords/ro | 434 ++++++ static/stopwords/ru | 559 +++++++ static/stopwords/sk | 221 +++ static/stopwords/sl | 446 ++++++ static/stopwords/so | 30 + static/stopwords/st | 31 + static/stopwords/sv | 418 ++++++ static/stopwords/sw | 74 + static/stopwords/th | 115 ++ static/stopwords/tl | 147 ++ static/stopwords/tr | 504 +++++++ static/stopwords/uk | 28 + static/stopwords/ur | 517 +++++++ static/stopwords/vi | 645 ++++++++ static/stopwords/yo | 60 + static/stopwords/zh | 788 ++++++++++ static/stopwords/zu | 29 + subprojects/gtest.wrap | 10 + test/cluster.cpp | 413 ++++++ test/compression.cpp | 109 ++ test/data/wikibooks_be_all_nopic_2017-02.zim | Bin 0 -> 152865 bytes ...ibooks_be_all_nopic_2017-02_splitted.zimaa | Bin 0 -> 51200 bytes ...ibooks_be_all_nopic_2017-02_splitted.zimab | Bin 0 -> 51200 bytes ...ibooks_be_all_nopic_2017-02_splitted.zimac | Bin 0 -> 50465 bytes ...ipedia_en_climate_change_nopic_2020-01.zim | Bin 0 -> 32131510 bytes test/dirent.cpp | 247 ++++ test/find.cpp | 103 ++ test/header.cpp | 97 ++ test/iterator.cpp | 125 ++ test/meson.build | 32 + test/pytest/basic_open_test.py | 121 ++ test/pytest/meson.build | 21 + test/pytest/wrapper/libzim_ext.pyx | 10 + test/pytest/wrapper/meson.build | 13 + test/pytest/wrapper/zim_wrapper.pxd | 8 + test/tempfile.cpp | 75 + test/tempfile.h | 52 + test/template.cpp | 78 + test/uuid.cpp | 118 ++ 181 files changed, 34405 insertions(+) create mode 100644 .codecov.yml create mode 100644 .github/FUNDING.yml create mode 100644 .github/move.yml create mode 100644 .github/workflows/ci.yml create mode 100644 .github/workflows/package.yml create mode 100644 .gitignore create mode 100644 AUTHORS create mode 100644 
COPYING create mode 100644 ChangeLog create mode 100644 README.md create mode 100644 debian/changelog create mode 100644 debian/control create mode 100644 debian/copyright create mode 100644 debian/libzim-dev.install create mode 100644 debian/libzim6.install create mode 100755 debian/rules create mode 100644 debian/source/format create mode 100644 examples/createZimExample.cpp create mode 100644 examples/meson.build create mode 100644 include/meson.build create mode 100644 include/zim/article.h create mode 100644 include/zim/blob.h create mode 100644 include/zim/error.h create mode 100644 include/zim/file.h create mode 100644 include/zim/fileheader.h create mode 100644 include/zim/fileiterator.h create mode 100644 include/zim/search.h create mode 100644 include/zim/search_iterator.h create mode 100644 include/zim/uuid.h create mode 100644 include/zim/writer/article.h create mode 100644 include/zim/writer/creator.h create mode 100644 include/zim/writer/url.h create mode 100644 include/zim/zim.h create mode 100644 meson.build create mode 100644 meson_options.txt create mode 100755 scripts/libzim-compile-resources create mode 100644 scripts/meson.build create mode 100644 src/_dirent.h create mode 100644 src/article.cpp create mode 100644 src/blob.cpp create mode 100644 src/buffer.cpp create mode 100644 src/buffer.h create mode 100644 src/cache.h create mode 100644 src/cluster.cpp create mode 100644 src/cluster.h create mode 100644 src/compression.cpp create mode 100644 src/compression.h create mode 100644 src/config.h.in create mode 100644 src/debug.h create mode 100644 src/dirent.cpp create mode 100644 src/endian_tools.h create mode 100644 src/envvalue.cpp create mode 100644 src/envvalue.h create mode 100644 src/file.cpp create mode 100644 src/file_compound.cpp create mode 100644 src/file_compound.h create mode 100644 src/file_part.h create mode 100644 src/file_reader.cpp create mode 100644 src/file_reader.h create mode 100644 src/fileheader.cpp create mode 100644 
src/fileimpl.cpp create mode 100644 src/fileimpl.h create mode 100644 src/fs.h create mode 100644 src/fs_unix.cpp create mode 100644 src/fs_unix.h create mode 100644 src/fs_windows.cpp create mode 100644 src/fs_windows.h create mode 100644 src/levenshtein.cpp create mode 100644 src/levenshtein.h create mode 100644 src/log.h create mode 100644 src/md5.c create mode 100644 src/md5.h create mode 100644 src/meson.build create mode 100644 src/search.cpp create mode 100644 src/search_internal.h create mode 100644 src/search_iterator.cpp create mode 100644 src/template.cpp create mode 100644 src/template.h create mode 100644 src/tools.cpp create mode 100644 src/tools.h create mode 100644 src/uuid.cpp create mode 100644 src/writer/_dirent.h create mode 100644 src/writer/article.cpp create mode 100644 src/writer/cluster.cpp create mode 100644 src/writer/cluster.h create mode 100644 src/writer/creator.cpp create mode 100644 src/writer/creatordata.h create mode 100644 src/writer/dirent.cpp create mode 100644 src/writer/direntPool.h create mode 100644 src/writer/queue.h create mode 100644 src/writer/workers.cpp create mode 100644 src/writer/workers.h create mode 100644 src/writer/xapianIndexer.cpp create mode 100644 src/writer/xapianIndexer.h create mode 100644 src/xapian/htmlparse.cc create mode 100644 src/xapian/htmlparse.h create mode 100644 src/xapian/myhtmlparse.cc create mode 100644 src/xapian/myhtmlparse.h create mode 100644 src/xapian/namedentities.h create mode 100644 src/zim_types.h create mode 100644 static/meson.build create mode 100644 static/resources_list.txt create mode 100644 static/stopwords/af create mode 100644 static/stopwords/ar create mode 100644 static/stopwords/bg create mode 100644 static/stopwords/bn create mode 100644 static/stopwords/br create mode 100644 static/stopwords/ca create mode 100644 static/stopwords/cs create mode 100644 static/stopwords/da create mode 100644 static/stopwords/de create mode 100644 static/stopwords/el create mode 100644 
static/stopwords/en create mode 100644 static/stopwords/eo create mode 100644 static/stopwords/es create mode 100644 static/stopwords/et create mode 100644 static/stopwords/eu create mode 100644 static/stopwords/fa create mode 100644 static/stopwords/fi create mode 100644 static/stopwords/fr create mode 100644 static/stopwords/ga create mode 100644 static/stopwords/gl create mode 100644 static/stopwords/ha create mode 100644 static/stopwords/he create mode 100644 static/stopwords/hi create mode 100644 static/stopwords/hr create mode 100644 static/stopwords/hu create mode 100644 static/stopwords/hy create mode 100644 static/stopwords/id create mode 100644 static/stopwords/it create mode 100644 static/stopwords/ja create mode 100644 static/stopwords/ko create mode 100644 static/stopwords/ku create mode 100644 static/stopwords/la create mode 100644 static/stopwords/lt create mode 100644 static/stopwords/lv create mode 100644 static/stopwords/mr create mode 100644 static/stopwords/ms create mode 100644 static/stopwords/nl create mode 100644 static/stopwords/no create mode 100644 static/stopwords/pl create mode 100644 static/stopwords/pt create mode 100644 static/stopwords/ro create mode 100644 static/stopwords/ru create mode 100644 static/stopwords/sk create mode 100644 static/stopwords/sl create mode 100644 static/stopwords/so create mode 100644 static/stopwords/st create mode 100644 static/stopwords/sv create mode 100644 static/stopwords/sw create mode 100644 static/stopwords/th create mode 100644 static/stopwords/tl create mode 100644 static/stopwords/tr create mode 100644 static/stopwords/uk create mode 100644 static/stopwords/ur create mode 100644 static/stopwords/vi create mode 100644 static/stopwords/yo create mode 100644 static/stopwords/zh create mode 100644 static/stopwords/zu create mode 100644 subprojects/gtest.wrap create mode 100644 test/cluster.cpp create mode 100644 test/compression.cpp create mode 100644 test/data/wikibooks_be_all_nopic_2017-02.zim 
create mode 100644 test/data/wikibooks_be_all_nopic_2017-02_splitted.zimaa create mode 100644 test/data/wikibooks_be_all_nopic_2017-02_splitted.zimab create mode 100644 test/data/wikibooks_be_all_nopic_2017-02_splitted.zimac create mode 100644 test/data/wikipedia_en_climate_change_nopic_2020-01.zim create mode 100644 test/dirent.cpp create mode 100644 test/find.cpp create mode 100644 test/header.cpp create mode 100644 test/iterator.cpp create mode 100644 test/meson.build create mode 100644 test/pytest/basic_open_test.py create mode 100644 test/pytest/meson.build create mode 100644 test/pytest/wrapper/libzim_ext.pyx create mode 100644 test/pytest/wrapper/meson.build create mode 100644 test/pytest/wrapper/zim_wrapper.pxd create mode 100644 test/tempfile.cpp create mode 100644 test/tempfile.h create mode 100644 test/template.cpp create mode 100644 test/uuid.cpp diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..21288b7 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,17 @@ +codecov: + notify: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + threshold: 1% + patch: + default: + target: 90% + threshold: 0% + +ignore: + - "test" + - "examples" diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..f39dc2a --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie 
username +custom: # https://kiwix.org/support-us/ diff --git a/.github/move.yml b/.github/move.yml new file mode 100644 index 0000000..3e1491a --- /dev/null +++ b/.github/move.yml @@ -0,0 +1,27 @@ +# Configuration for Move Issues - https://github.com/dessant/move-issues + +# Delete the command comment when it contains no other content +deleteCommand: true + +# Close the source issue after moving +closeSourceIssue: true + +# Lock the source issue after moving +lockSourceIssue: false + +# Mention issue and comment authors +mentionAuthors: true + +# Preserve mentions in the issue content +keepContentMentions: true + +# Move labels that also exist on the target repository +moveLabels: true + +# Set custom aliases for targets +# aliases: +# r: repo +# or: owner/repo + +# Repository to extend settings from +# _extends: repo \ No newline at end of file diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..4c4a860 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,156 @@ +name: CI + +on: [push] + +jobs: + Macos: + strategy: + fail-fast: false + matrix: + target: + - native_dyn + - iOS_arm64 + - iOS_i386 + - iOS_x86_64 + - iOS_armv7 + runs-on: macos-latest + steps: + - name: Checkout code + uses: actions/checkout@v1 + - name: Setup python 3.6 + uses: actions/setup-python@v1 + with: + python-version: '3.6' + - name: Install packages + uses: mstksg/get-package@v1 + with: + brew: gcovr pkg-config ninja + - name: Install python modules + run: pip3 install meson==0.52.1 pytest + - name: Install deps + shell: bash + run: | + ARCHIVE_NAME=deps2_osx_${{matrix.target}}_libzim.tar.xz + wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C $HOME + - name: Compile + shell: bash + run: | + MESON_OPTION="--default-library=shared" + MESON_CROSSFILE="$HOME/BUILD_${{matrix.target}}/meson_cross_file.txt" + if [[ ! 
"${{matrix.target}}" =~ native_.* ]]; then + MESON_OPTION="$MESON_OPTION -Db_bitcode=true --cross-file $MESON_CROSSFILE" + cat $MESON_CROSSFILE + fi + export PKG_CONFIG_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig + meson . build ${MESON_OPTION} + cd build + ninja + - name: Test + if: startsWith(matrix.target, 'native_') + shell: bash + run: | + export LD_LIBRARY_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib:$HOME/BUILD_${{matrix.target}}/INSTALL/lib64 + cd build + meson test --verbose + env: + SKIP_BIG_MEMORY_TEST: 1 + + Linux: + strategy: + fail-fast: false + matrix: + target: + - native_static + - native_dyn + - android_arm + - android_arm64 + - win32_static + - win32_dyn + include: + - target: native_static + image_variant: xenial + lib_postfix: '/x86_64-linux-gnu' + - target: native_dyn + image_variant: xenial + lib_postfix: '/x86_64-linux-gnu' + - target: android_arm + image_variant: xenial + lib_postfix: '/x86_64-linux-gnu' + - target: android_arm64 + image_variant: xenial + lib_postfix: '/x86_64-linux-gnu' + - target: win32_static + image_variant: f31 + lib_postfix: '64' + - target: win32_dyn + image_variant: f31 + lib_postfix: '64' + env: + HOME: /home/runner + runs-on: ubuntu-latest + container: + image: "kiwix/kiwix-build_ci:${{matrix.image_variant}}-26" + steps: + - name: Extract branch name + shell: bash + run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" + id: extract_branch + - name: Checkout code + shell: python + run: | + from subprocess import check_call + from os import environ + command = [ + 'git', 'clone', + 'https://github.com/${{github.repository}}', + '--depth=1', + '--branch', '${{steps.extract_branch.outputs.branch}}' + ] + check_call(command, cwd=environ['HOME']) + - name: Install deps + shell: bash + run: | + ARCHIVE_NAME=deps2_${OS_NAME}_${{matrix.target}}_libzim.tar.xz + wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C /home/runner + - name: Compile + shell: bash + run: | + if [[ 
"${{matrix.target}}" =~ .*_dyn ]]; then + MESON_OPTION="--default-library=shared" + else + MESON_OPTION="--default-library=static" + fi + if [[ "${{matrix.target}}" =~ native_.* ]]; then + MESON_OPTION="$MESON_OPTION -Db_coverage=true" + else + MESON_OPTION="$MESON_OPTION --cross-file $HOME/BUILD_${{matrix.target}}/meson_cross_file.txt" + fi + if [[ "${{matrix.target}}" =~ android_.* ]]; then + MESON_OPTION="$MESON_OPTION -Dandroid=true" + fi + cd $HOME/libzim + meson . build ${MESON_OPTION} + cd build + ninja + env: + PKG_CONFIG_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}/pkgconfig" + - name: Test + if: startsWith(matrix.target, 'native_') + shell: bash + run: | + cd $HOME/libzim/build + meson test --verbose + ninja coverage + env: + LD_LIBRARY_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}" + SKIP_BIG_MEMORY_TEST: 1 + - name: Publish coverage + shell: bash + run: | + cd $HOME/libzim + curl https://codecov.io/bash -o codecov.sh + bash codecov.sh -n "${OS_NAME}_${{matrix.target}}" -Z + rm codecov.sh + if: startsWith(matrix.target, 'native_') + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml new file mode 100644 index 0000000..ce13e6d --- /dev/null +++ b/.github/workflows/package.yml @@ -0,0 +1,82 @@ +name: Packages +on: [push, pull_request] + +jobs: + build-deb: + runs-on: ubuntu-latest + strategy: + matrix: + distro: [debian-unstable, debian-bullseye, debian-buster, ubuntu-groovy, ubuntu-focal, ubuntu-eoan] + steps: + - uses: actions/checkout@v2 + + - uses: legoktm/gh-action-auto-dch@master + with: + fullname: Kiwix builder + email: release+launchpad@kiwix.org + distro: ${{ matrix.distro }} + + - uses: legoktm/gh-action-build-deb@debian-unstable + if: matrix.distro == 'debian-unstable' + name: Build package for 
debian-unstable + id: build-debian-unstable + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@debian-bullseye + if: matrix.distro == 'debian-bullseye' + name: Build package for debian-bullseye + id: build-debian-bullseye + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@debian-buster + if: matrix.distro == 'debian-buster' + name: Build package for debian-buster + id: build-debian-buster + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@ubuntu-groovy + if: matrix.distro == 'ubuntu-groovy' + name: Build package for ubuntu-groovy + id: build-ubuntu-groovy + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@ubuntu-focal + if: matrix.distro == 'ubuntu-focal' + name: Build package for ubuntu-focal + id: build-ubuntu-focal + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@ubuntu-eoan + if: matrix.distro == 'ubuntu-eoan' + name: Build package for ubuntu-eoan + id: build-ubuntu-eoan + with: + args: --no-sign + + - uses: actions/upload-artifact@v2 + with: + name: Packages for ${{ matrix.distro }} + path: output + + - uses: legoktm/gh-action-dput@master + name: Upload dev package + # Only upload on pushes to master + if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' && startswith(matrix.distro, 'ubuntu-') + with: + gpg_key: ${{ secrets.LAUNCHPAD_GPG }} + repository: ppa:kiwixteam/dev + packages: output/*_source.changes + + - uses: legoktm/gh-action-dput@master + name: Upload release package + # Only upload on pushes to master or tag + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && startswith(matrix.distro, 'ubuntu-') + with: + gpg_key: ${{ secrets.LAUNCHPAD_GPG }} + repository: ppa:kiwixteam/release + packages: output/*_source.changes diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1d89127 --- /dev/null +++ b/.gitignore @@ -0,0 +1,34 @@ +*~ +*#* +autom4te.cache +build +compile +config.h +configure +depcomp +.deps 
+.dirstamp +INSTALL +install-sh +*.kate-swp +*.la +.libs +libtool +*.lo +ltmain.sh +*.m4 +Makefile +Makefile.in +missing +*.o +stamp-h1 +.svn +.*.swp +*.zim +examples/createZimExample +src/tools/zimdump +src/tools/zimsearch +libzim.pc +test-driver +test/zimlib-test* +test/test-suite.log diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..1197f56 --- /dev/null +++ b/AUTHORS @@ -0,0 +1 @@ +Tommi Maekitalo diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..e2683b5 --- /dev/null +++ b/COPYING @@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. 
+These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. 
The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. 
+ + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. 
+ +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..3ed6d50 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,294 @@ +libzim 6.1.8 +============ + + * Increase default timeout for test to 120 seconds/test + * Compression algorithm to use can be passed to `zim::writer::Creator` + * Add automatic debian packaging of libzim. + * Fix using of tmpdir (and now use env var TMPDIR) during tests. + + +libzim 6.1.7 +============ + + * Do not assume urlPtrPos is just after the mimetype list. + * Fix compilation of compression test. + * Do not exit but throw an exception if an ASSERT is not fulfilled. + +libzim 6.1.6 +============ + + * Better (faster) implementation of the ordering of article by cluster. + * Fix compression algorithm. + +libzim 6.1.5 +============ + + * [Writer] Remove unused declaration of classes.
+ Those classes were not implemented nor used at all. + +libzim 6.1.4 +============ + + * [Writer] Fix excessive memory usage. Data of the cluster was cleaned at the + end of the process, not as soon as we no longer need it. + +libzim 6.1.3 +============ + + * [Writer] Use a `.tmp` suffix and rename to `.zim` at the end of the write + process. + * Add unit tests + * Do not include unnecessary `windows.h` headers in public zim's headers. + +libzim 6.1.2 +============ + + * [CI] Fix codecov configuration + * [Writer] Fix threads synchronization at end of writing process. + +libzim 6.1.1 +============ + + * Fix bug around the find function + +libzim 6.1.0 +============ + + * Compile now on OpenBSD + * [Test] Use the main function provided by gtest. + * [CI] Move the CI compilation to github actions. + * Add stopwords for 54 new languages. + * [Writer] Improve the way we are writing cluster at zim creation time. + - Clusters are directly written in the zim file instead of using temporary + files. + - mimetypes are limited to 944 bytes. + * Add a new type of iterator to iterate over articles in a performant way + reducing decompression of clusters. This is now the new default iterator. + * Add support for zim files compressed with zstd compression algorithm. + It is not possible to use zstd to create zim file for now. + +libzim 6.0.2 +============ + + * Fix search suggestion parsing. + +libzim 6.0.1 +============ + + * Fix crash when trying to open an empty file. + * Ensure that pytest tests are run on the CI. + +libzim 6.0.0 +============ + + * [Writer] Index the articles in different threads. This is a huge speed + improvement as the main thread is not blocked by indexing. + * Index the title only if `shouldIndex` returns true. + +libzim 5.1.0 +============ + + * Improve indexation of the title.
+ * Better pertinence of suggestions (only for new zim files) + * Improvement of the speed of Levenshtein distance for suggestions (for old + zims) + +libzim 5.0.2 +============ + + * Improve README. + * Remove gtest as embedded subproject. + * Better lzma compression. + * Better performance of the Levenshtein algorithm (better suggestions + performance) + +libzim 5.0.1 +============ + + * Update README. + * [Writer] Add debug information (print progress of the clusters writing). + * [Writer] Correctly print the url to the user. + * [CI] Add code coverage. + +libzim 5.0.0 +============ + + * Fix thread slipping for win32 crosscompilation. + * Fix a potential invalid access when reading dirent. + * Fix memory leak in the decompression algorithm. + * [Writer] Fix a memory leak (cluster cleaning) + * [Writer] Write article data in a temporary cluster file instead of a + temporary file per article. + * [Writer] Better algorithm to store the dirent while creating the zim + file. Better memory usage. + * [Writer] [API Change] Url/Ns are now handled using the same struct Url. + * [Writer] [API Change] No more aid and redirectAid. A redirectArticle + has to implement redirectUrl. + * [Writer] Use a memory pool to avoid multiple small memory allocations. + * [Writer] [API Change] Rename `ZimCreator` to `Creator`. + * [API Change] File's `search` and `suggestions` now return a unique_ptr + instead of a raw pointer. + +libzim 4.0.7 +============ + + * Build libzim without rpath. + +libzim 4.0.6 +============ + + * Support zim file created with cluster not written sequentially. + * Remove a meson warning. + +libzim 4.0.5 +============ + + * Store the xapian database in the right url. + * Do not fail when reading very small zim file (<256b). + * Do not print message on normal behavior. + * [BUILDSYSTEM] Be able to build a dynamic lib (libzim.so) but using static + dependencies. + * [CI] Use last version of meson.
+ * [CI] Use the new deps archive xz + +libzim 4.0.4 +============ + + * Fix opening of multi-part zim. + * Fix conversion of path to wpath on Windows. + +libzim 4.0.3 +============ + + * Implement low level file manipulation using different backends + +libzim 4.0.2 +============ + + * [Windows] Fix opening of zim file bigger than 4GiB + +libzim 4.0.1 +============ + + * [Writer] Fix wrong redirection log message + * Make libzim compile natively on windows using MSVC + * Better message when failing to read a zim file. + * Make libzim on windows correctly open unicode path. + * Add compilation option to use less memory (but more I/O). + Useful on low memory devices (android) + * Small fixes + +libzim 4.0.0 +============ + + * [Writer] Remove a lot of memory copy. + * [Writer] Add xapian indexing directly in libzim. + * [Writer] Better API. + * [Writer] Use multi-threading to write clusters. + * [Writer] Ensure mimetype of an article is not null. + * Extend test timeout for cluster's test. + * Less memory copy for cluster's test. + * Allow skipping test using a lot of memory using env variable + `SKIP_BIG_MEMORY_TEST=1` + * Explicitly use the icu namespace to allow using of packaged icu lib. + * Use a temporary file name as long as the ZIM writing process is + not finished (#163) + * [Travis] Do not compile using gcc-5 (but the default trusty's one 4.8) + +libzim 3.3.0 +============ + + * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly + done by : + * Do not mmap the whole cluster by default. + * MMap only the memory associated to an article. + * If an article is > 4GiB, the blob associated to it is invalid + (data==size==0). + * Other information are still valid (directAccessInformation, ...) + * Fix writing of extended cluster in writer. + * Compile libzim on macos. + * Build libzim setting RPATH. + * Search result urls are now what is stored in the zim file. They should not + start with a `/`.
This is a revert of the change made in last release. + (See kiwix/kiwix-lib#123) + * Spelling corrections in README. + +libzim 3.2.0 +============ + + * Support geo query if the xapian database has indexed localisation. + * Handle articles bigger than 4Go in the zim file (#110). + * Use AND operator between search term. + * Fix compilation with recent clang (#95). + * Add method to get article's data localisation in the zim file. + * Be able to get only a part of article (#77). + * Do not crash if we cannot open the xapian Database for some reasons. + (kiwix/kiwix-tools#153) + * Do not assume there is always a checksum in the zim file. + (kiwix/kiwix-tools#150) + * Try to do some sanity checks when opening a zim file. + * Use pytest to do some tests (when cython is available). + * Use levenshtein distance to sort and have better suggestion results. + * Search result urls are now always absolute (start with a '/'). + (kiwix/kiwix-lib#110) + * Open the file readonly when checking the zim file (and so be able to check + read only file). + * Accept absolute url starting with '/' when searching for article. + * Fix various bugs + +libzim 3.1.0 +============ + + * Lzma is not an optional dependency anymore. + * Better handle (report and not crash) invalid zim file. + * Embed source of gtest (used only if gtest is not available on the system) + * Move zimDump tools out of libzim repository to zim-tools + * ZimCreator tools doesn't read command line to set options. + +libzim 3.0.0 +============ + +This is a major change of the libzim. +Expect a lot of new improvements and API changes. + + * Add a suggestion mode to the search + * Fix licensing issues + * Fix wrong stemming of the query when searching + * Deactivate searching (and so crash) in the embedded database if the zim is + split + * Rewrite the low level memory management of libzim when reading a zim file: + * We use a buffer base entity to handle memory and reading file instead of + reading file using stream.
+ * MMap the memory when possible to avoid memory copy. + * Use const when possible (API break) + * Move to googletest instead of cxxtools for unit-tests. + * Fix endianness bug on arm. + * Do not install private headers. Those headers declare private structure and + should not be visible (API break) + * Compile libzim with `-Werror` and `-Wall` options. + * Make libzim thread safe for reading article. + The search part is not thread safe, and all search operations must be + protected by a lock. + * Add method to get only a part of an article. + * Move some tools to zim-tools repository. + + +libzim 2.0.0 +============ + + * Move to meson build system + `libzim` now uses `meson` as build system instead of `autotools` + * Move to C++11 standard. + * Fulltext search in zim file. + We have integrated the xapian fulltext search in libzim. + So now, libzim provides an API to search in a zim containing embedded fulltext + index. This means that : + * libzim needs xapian as an (optional) dependency (if you want to compile with + xapian support). + * The old and unused search API has been removed. + * Remove bzip2 support. + * Remove Symbian support. + * Few API changes + * Make some header files private (not installed); + * A `Blob` can now be cast to a `string` directly; + * Change a lot of `File` methods to const methods. diff --git a/README.md b/README.md new file mode 100644 index 0000000..5f2e7f2 --- /dev/null +++ b/README.md @@ -0,0 +1,147 @@ +ZIM library +=========== + +The ZIM library is the reference implementation for the ZIM file +format. It's a solution to read and write ZIM files on many systems +and architectures. More information about the ZIM format and the +openZIM project at https://openzim.org/.
+ +[![latest release](https://img.shields.io/github/v/tag/openzim/libzim?label=latest%20release&sort=semver)](https://download.openzim.org/release/libzim/) +[![Build Status](https://github.com/openzim/libzim/workflows/CI/badge.svg?query=branch%3Amaster)](https://github.com/openzim/libzim/actions?query=branch%3Amaster) +[![codecov](https://codecov.io/gh/openzim/libzim/branch/master/graph/badge.svg)](https://codecov.io/gh/openzim/libzim) +[![CodeFactor](https://www.codefactor.io/repository/github/openzim/libzim/badge)](https://www.codefactor.io/repository/github/openzim/libzim) +[![License: GPL v2](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) + +Disclaimer +---------- + +This document assumes you have a little knowledge about software +compilation. If you experience difficulties with the dependencies or +with the ZIM library compilation itself, we recommend having a look +at [kiwix-build](https://github.com/kiwix/kiwix-build). + +Preamble +-------- + +Although the ZIM library can be compiled/cross-compiled on/for many +systems, the following documentation explains how to do it on POSIX +ones. It is primarily intended for GNU/Linux systems and has been tested +on recent releases of Ubuntu and Fedora. + +Dependencies +------------ + +The ZIM library relies on many third-party software libraries. They +are prerequisites to the ZIM library compilation.
The following +libraries need to be available: + +* [Z](https://zlib.net/) (package `zlib1g-dev` on Ubuntu) +* [LZMA](https://tukaani.org/lzma/) (package `liblzma-dev` on Ubuntu) +* [ICU](http://site.icu-project.org/) (package `libicu-dev` on Ubuntu) +* [Zstd](https://facebook.github.io/zstd/) (package `libzstd-dev` on Ubuntu) +* [Xapian](https://xapian.org/) - optional (package `libxapian-dev` on Ubuntu) +* [UUID](http://e2fsprogs.sourceforge.net/) (package `uuid-dev` on Ubuntu) +* [Google Test](https://github.com/google/googletest) - optional (package `googletest` on Ubuntu) + +These dependencies may or may not be packaged by your operating +system. They may also be packaged but only in an older version. The +compilation script will tell you if one of them is missing or too old. +In the worst case, you will have to download and compile a more recent +version by hand. + +If you want to install these dependencies locally, then ensure that +meson (through `pkg-config`) will properly find them. + +Environment +------------- + +The ZIM library builds using [Meson](https://mesonbuild.com/) version +0.43 or higher. Meson itself relies on Ninja, Pkg-config and a few other +compilation tools. + +Install first the few common compilation tools: +* Meson +* Ninja +* Pkg-config + +These tools should be packaged if you use a cutting edge operating +system. If not, have a look at the "Troubleshooting" section. + +Compilation +----------- + +Once all dependencies are installed, you can compile the ZIM library with: +```bash +meson . build +ninja -C build +``` + +By default, it will compile dynamically linked libraries. All binary files +will be created in the `build` directory created automatically by +Meson. If you want statically linked libraries, you can add +`--default-library=static` option to the Meson command. + +Depending on your system, `ninja` may be called `ninja-build`.
+ +Installation +------------ + +If you want to install the libzim and the headers you just have +compiled on your system, here we go: +```bash +ninja -C build install +``` + +You might need to run the command as root (or using `sudo`), depending +on where you want to install the libraries. After the installation +has succeeded, you may need to run ldconfig (as root). + +Uninstallation +-------------- + +If you want to uninstall the libzim: +```bash +ninja -C build uninstall +``` + +Like for the installation, you might need to run the command as root +(or using `sudo`). + +Troubleshooting +--------------- + +If you need to install Meson "manually": +```bash +virtualenv -p python3 ./ # Create virtualenv +source bin/activate # Activate the virtualenv +pip3 install meson # Install Meson +hash -r # Refresh bash paths +``` + +If you need to install Ninja "manually": +```bash +git clone https://github.com/ninja-build/ninja.git +cd ninja +git checkout release +./configure.py --bootstrap +mkdir ../bin +cp ninja ../bin +cd .. +``` + +If the automated tests fail or time out, you need to be aware that this +test suite needs up to 16GB of memory. You can skip these specific tests with: +```bash +SKIP_BIG_MEMORY_TEST=1 ninja test +``` + +If the compilation still fails, you might need to get a more recent +version of a dependency than the one packaged by your Linux +distribution. Try then with a source tarball distributed by the +problematic upstream project or even directly from the source code +repository. + +License +------- + +[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or later, see [COPYING](COPYING) for more details. diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..ba99cf1 --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +zimlib (0.0.0) unstable; urgency=medium + + * Initial release.
+ + -- Kunal Mehta Tue, 02 Jun 2020 01:49:48 -0700 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..607f71a --- /dev/null +++ b/debian/control @@ -0,0 +1,69 @@ +Source: zimlib +Section: libs +Priority: optional +Build-Depends: debhelper-compat (= 12), + liblzma-dev, + zlib1g-dev, + libicu-dev, + libxapian-dev, + libzstd-dev, + uuid-dev, + libgtest-dev, + meson, + ninja-build, + pkg-config +Maintainer: Kiwix team +Homepage: https://www.openzim.org/wiki/Zimlib +Standards-Version: 4.4.1 +Rules-Requires-Root: no + +Package: libzim6 +Architecture: any +Multi-Arch: same +Depends: ${misc:Depends}, + ${shlibs:Depends} +Pre-Depends: ${misc:Pre-Depends} +Conflicts: libzim0, libzim0v5, libzim2, libzim4, libzim5 +Replaces: libzim0, libzim0v5, libzim2, libzim4, libzim5 +Description: library implementation of ZIM specifications + ZIM (Zeno IMproved) is an open file format for storing the contents of + wiki for offline usage. This file format is primarily focused on + providing the contents of Wikipedia and Wikimedia projects for offline + use. + . + zimlib is the standard implementation of ZIM specification, which + implements the read and write method for ZIM files. + . + ZIM is a file format created with focus on extracting and encoding data + from Mediawiki for offline use. + . + Features of zimlib are: + * Native, coded in C++ + * Extremely fast + * Minimal footprint + * Minimal dependencies + * Portable on most OS (Windows, Linux, Mac OS X) + +Package: libzim-dev +Section: libdevel +Architecture: any +Depends: ${misc:Depends}, + libzim6 (= ${binary:Version}), + liblzma-dev, + zlib1g-dev, + libxapian-dev, + libicu-dev, + libzstd-dev +Description: library implementation of ZIM specifications (development) + ZIM (Zeno IMproved) is an open file format for storing the contents of + wiki for offline usage. This file format is primarily focused on + providing the contents of Wikipedia and Wikimedia projects for offline + use. + . 
+ zimlib is the standard implementation of ZIM specification, which + implements the read and write method for ZIM files. + . + ZIM is a file format created with focus on extracting and encoding data + from Mediawiki for offline use. + . + This package contains development files. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..ff46366 --- /dev/null +++ b/debian/copyright @@ -0,0 +1 @@ +See COPYING in the repository root. diff --git a/debian/libzim-dev.install b/debian/libzim-dev.install new file mode 100644 index 0000000..1c1f0c5 --- /dev/null +++ b/debian/libzim-dev.install @@ -0,0 +1,3 @@ +usr/include/* +usr/lib/*/libzim.so +usr/lib/*/pkgconfig/* \ No newline at end of file diff --git a/debian/libzim6.install b/debian/libzim6.install new file mode 100644 index 0000000..146d0ad --- /dev/null +++ b/debian/libzim6.install @@ -0,0 +1 @@ +usr/lib/*/*.so.* \ No newline at end of file diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..1335b45 --- /dev/null +++ b/debian/rules @@ -0,0 +1,7 @@ +#!/usr/bin/make -f +export DEB_BUILD_MAINT_OPTIONS = hardening=+all + +# Skip some extremely memory-intensive tests +export SKIP_BIG_MEMORY_TEST=1 +%: + dh $@ --buildsystem=meson diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..89ae9db --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (native) diff --git a/examples/createZimExample.cpp b/examples/createZimExample.cpp new file mode 100644 index 0000000..195a140 --- /dev/null +++ b/examples/createZimExample.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2012 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include + +class TestArticle : public zim::writer::Article +{ + std::string _id; + std::string _data; + + public: + TestArticle() { } + explicit TestArticle(const std::string& id); + virtual ~TestArticle() = default; + + virtual std::string getAid() const; + virtual zim::writer::Url getUrl() const; + virtual std::string getTitle() const; + virtual bool isRedirect() const; + virtual bool shouldCompress() const { return true; } + virtual std::string getMimeType() const; + virtual zim::writer::Url getRedirectUrl() const; + virtual bool shouldIndex() const { return false; } + virtual zim::size_type getSize() const { return _data.size(); } + virtual std::string getFilename() const { return ""; } + + virtual zim::Blob getData() const + { return zim::Blob(&_data[0], _data.size()); } +}; + +TestArticle::TestArticle(const std::string& id) + : _id(id) +{ + std::ostringstream data; + data << "this is article " << id << std::endl; + _data = data.str(); +} + +std::string TestArticle::getAid() const +{ + return _id; +} + +zim::writer::Url TestArticle::getUrl() const +{ + return zim::writer::Url('A', _id); +} + +std::string TestArticle::getTitle() const +{ + return _id; +} + +bool TestArticle::isRedirect() const +{ + return false; +} + +std::string TestArticle::getMimeType() const +{ + return "text/plain"; +} + +zim::writer::Url TestArticle::getRedirectUrl() const +{ + return zim::writer::Url(); +} + +int main(int argc, char* 
argv[]) +{ + unsigned max = 16; + try { + zim::writer::Creator c(false, zim::zimcompZstd); + c.startZimCreation("foo.zim"); + for (unsigned n = 0; n < max; ++n) + { + std::ostringstream id; + id << (n + 1); + auto article = std::make_shared(id.str()); + c.addArticle(article); + } + c.finishZimCreation(); + } + catch (const std::exception& e) + { + std::cerr << e.what() << std::endl; + } +} + diff --git a/examples/meson.build b/examples/meson.build new file mode 100644 index 0000000..fb6b77c --- /dev/null +++ b/examples/meson.build @@ -0,0 +1,6 @@ + +executable('createZimExample', 'createZimExample.cpp', + link_with: libzim, + link_args: extra_link_args, + include_directories: include_directory, + dependencies: [thread_dep, xapian_dep, icu_dep, zlib_dep, lzma_dep]) diff --git a/include/meson.build b/include/meson.build new file mode 100644 index 0000000..6229ca8 --- /dev/null +++ b/include/meson.build @@ -0,0 +1,23 @@ +include_directory = include_directories('.') + +install_headers( + 'zim/article.h', + 'zim/blob.h', + 'zim/error.h', + 'zim/file.h', + 'zim/fileheader.h', + 'zim/fileiterator.h', + 'zim/search.h', + 'zim/search_iterator.h', + 'zim/uuid.h', + 'zim/zim.h', + subdir:'zim' +) + +install_headers( + 'zim/writer/article.h', + 'zim/writer/url.h', + 'zim/writer/creator.h', + subdir:'zim/writer' +) + diff --git a/include/zim/article.h b/include/zim/article.h new file mode 100644 index 0000000..3aa3082 --- /dev/null +++ b/include/zim/article.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ARTICLE_H +#define ZIM_ARTICLE_H + +#include +#include "zim.h" +#include "blob.h" +#include +#include + +#ifdef max +#undef max +#endif + +namespace zim +{ + class Cluster; + class Dirent; + class FileImpl; + + class Article + { + private: + std::shared_ptr file; + article_index_type idx; + + std::shared_ptr getDirent() const; + + public: + Article() + : idx(std::numeric_limits::max()) + { } + + Article(std::shared_ptr file_, article_index_type idx_) + : file(file_), + idx(idx_) + { } + + std::string getParameter() const; + + std::string getTitle() const; + std::string getUrl() const; + std::string getLongUrl() const; + + uint16_t getLibraryMimeType() const; + const std::string& getMimeType() const; + + bool isRedirect() const; + bool isLinktarget() const; + bool isDeleted() const; + + char getNamespace() const; + + article_index_type getRedirectIndex() const; + Article getRedirectArticle() const; + + size_type getArticleSize() const; + + bool operator< (const Article& a) const + { return getNamespace() < a.getNamespace() + || (getNamespace() == a.getNamespace() + && getTitle() < a.getTitle()); } + + std::shared_ptr getCluster() const; + cluster_index_type getClusterNumber() const; + + Blob getData(offset_type offset=0) const; + Blob getData(offset_type offset, size_type size) const; + + offset_type getOffset() const; + std::pair getDirectAccessInformation() const; + + std::string getPage(bool layout = true, unsigned maxRecurse = 10); + void 
getPage(std::ostream&, bool layout = true, unsigned maxRecurse = 10); + + article_index_type getIndex() const { return idx; } + + bool good() const { return idx != std::numeric_limits::max(); } + }; + +} + +#endif // ZIM_ARTICLE_H + diff --git a/include/zim/blob.h b/include/zim/blob.h new file mode 100644 index 0000000..928394e --- /dev/null +++ b/include/zim/blob.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BLOB_H +#define ZIM_BLOB_H + +#include "zim.h" + +#include +#include +#include +#include + +namespace zim +{ + class Buffer; + class Blob + { + const char* _data; + size_type _size; + std::shared_ptr _buffer; + + public: + Blob(); + Blob(const char* data, size_type size); + Blob(std::shared_ptr buffer); + + operator std::string() const { return std::string(_data, _size); } + const char* data() const { return _data; } + const char* end() const { return _data + _size; } + size_type size() const { return _size; } + }; + + inline std::ostream& operator<< (std::ostream& out, const Blob& blob) + { + if (blob.data()) + out.write(blob.data(), blob.size()); + return out; + } + + inline bool operator== (const Blob& b1, const Blob& b2) + { + return b1.size() == b2.size() + && 
std::equal(b1.data(), b1.data() + b1.size(), b2.data()); + } +} + +#endif // ZIM_BLOB_H diff --git a/include/zim/error.h b/include/zim/error.h new file mode 100644 index 0000000..fb59e0d --- /dev/null +++ b/include/zim/error.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ERROR_H +#define ZIM_ERROR_H + +#include + +namespace zim +{ + class ZimFileFormatError : public std::runtime_error + { + public: + explicit ZimFileFormatError(const std::string& msg) + : std::runtime_error(msg) + { } + }; + +} + +#endif // ZIM_ERROR_H + diff --git a/include/zim/file.h b/include/zim/file.h new file mode 100644 index 0000000..970aaa0 --- /dev/null +++ b/include/zim/file.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_H +#define ZIM_FILE_H + +#include +#include +#include +#include "zim.h" +#include "article.h" +#include "blob.h" +#include "fileheader.h" + +namespace zim +{ + class Search; + class FileImpl; + class Cluster; + + class File + { + std::shared_ptr impl; + + public: + File() + { } + explicit File(const std::string& fname); + + const std::string& getFilename() const; + const Fileheader& getFileheader() const; + offset_type getFilesize() const; + + article_index_type getCountArticles() const; + + Article getArticle(article_index_type idx) const; + Article getArticle(char ns, const std::string& url) const; + Article getArticleByUrl(const std::string& url) const; + Article getArticleByTitle(article_index_type idx) const; + Article getArticleByTitle(char ns, const std::string& title) const; + Article getArticleByClusterOrder(article_index_type idx) const; + + std::shared_ptr getCluster(cluster_index_type idx) const; + cluster_index_type getCountClusters() const; + offset_type getClusterOffset(cluster_index_type idx) const; + + Blob getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const; + offset_type getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const; + + article_index_type getNamespaceBeginOffset(char ch) const; + article_index_type getNamespaceEndOffset(char ch) const; + article_index_type getNamespaceCount(char ns) const; + + std::string getNamespaces() const; + bool hasNamespace(char ch) const; + + class 
const_iterator; + + const_iterator begin() const; + const_iterator beginByTitle() const; + const_iterator beginByUrl() const; + const_iterator end() const; + const_iterator findByTitle(char ns, const std::string& title) const; + const_iterator find(char ns, const std::string& url) const; + const_iterator find(const std::string& url) const; + + + std::unique_ptr search(const std::string& query, int start, int end) const; + std::unique_ptr suggestions(const std::string& query, int start, int end) const; + + time_t getMTime() const; + + const std::string& getMimeType(uint16_t idx) const; + + std::string getChecksum(); + bool verify(); + + bool is_multiPart() const; + }; + + std::string urldecode(const std::string& url); + +} + +#endif // ZIM_FILE_H + diff --git a/include/zim/fileheader.h b/include/zim/fileheader.h new file mode 100644 index 0000000..4d67809 --- /dev/null +++ b/include/zim/fileheader.h @@ -0,0 +1,123 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEHEADER_H +#define ZIM_FILEHEADER_H + +#include +#include "zim.h" +#include "uuid.h" +#include +#include + +// max may be defined as a macro by Windows includes +#ifdef max +#undef max +#endif + +namespace zim +{ + class Buffer; + class Fileheader + { + public: + static const uint32_t zimMagic; + static const uint16_t zimClassicMajorVersion; + static const uint16_t zimExtendedMajorVersion; + static const uint16_t zimMinorVersion; + static const size_type size; + + private: + uint16_t majorVersion; + uint16_t minorVersion; + Uuid uuid; + article_index_type articleCount; + offset_type titleIdxPos; + offset_type urlPtrPos; + offset_type mimeListPos; + cluster_index_type clusterCount; + offset_type clusterPtrPos; + article_index_type mainPage; + article_index_type layoutPage; + offset_type checksumPos; + + public: + Fileheader() + : majorVersion(zimClassicMajorVersion), + minorVersion(zimMinorVersion), + articleCount(0), + titleIdxPos(0), + urlPtrPos(0), mimeListPos(0), // read by hasChecksum(); must not stay indeterminate + clusterCount(0), + clusterPtrPos(0), + mainPage(std::numeric_limits::max()), + layoutPage(std::numeric_limits::max()), + checksumPos(std::numeric_limits::max()) + {} + + void write(int out_fd) const; + void read(std::shared_ptr buffer); + + // Do some sanity check, raise a ZimFileFormatError if
+ void sanity_check() const; + + uint16_t getMajorVersion() const { return majorVersion; } + void setMajorVersion(uint16_t v) { majorVersion = v; } + + uint16_t getMinorVersion() const { return minorVersion; } + void setMinorVersion(uint16_t v) { minorVersion = v; } + + const Uuid& getUuid() const { return uuid; } + void setUuid(const Uuid& uuid_) { uuid = uuid_; } + + article_index_type getArticleCount() const { return articleCount; } + void setArticleCount(article_index_type s) { articleCount = s; } + + offset_type getTitleIdxPos() const { return titleIdxPos; } + void setTitleIdxPos(offset_type p) { titleIdxPos = p; } + + offset_type getUrlPtrPos() const { return urlPtrPos; } + void setUrlPtrPos(offset_type p) { urlPtrPos = p; } + + offset_type getMimeListPos() const { return mimeListPos; } + void setMimeListPos(offset_type p) { mimeListPos = p; } + + cluster_index_type getClusterCount() const { return clusterCount; } + void setClusterCount(cluster_index_type s) { clusterCount = s; } + + offset_type getClusterPtrPos() const { return clusterPtrPos; } + void setClusterPtrPos(offset_type p) { clusterPtrPos = p; } + + bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } + article_index_type getMainPage() const { return mainPage; } + void setMainPage(article_index_type s){ mainPage = s; } + + bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } + article_index_type getLayoutPage() const { return layoutPage; } + void setLayoutPage(article_index_type s) { layoutPage = s; } + + bool hasChecksum() const { return getMimeListPos() >= 80; } + offset_type getChecksumPos() const { return hasChecksum() ? 
checksumPos : 0; } + void setChecksumPos(offset_type p) { checksumPos = p; } + }; + +} + +#endif // ZIM_FILEHEADER_H diff --git a/include/zim/fileiterator.h b/include/zim/fileiterator.h new file mode 100644 index 0000000..ea7943b --- /dev/null +++ b/include/zim/fileiterator.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEITERATOR_H +#define ZIM_FILEITERATOR_H + +#include +#include "article.h" + +namespace zim +{ + class File::const_iterator : public std::iterator + { + public: + enum Mode { + UrlIterator, + ArticleIterator, + ClusterIterator + }; + + private: + const File* file; + article_index_type idx; + mutable Article article; + Mode mode; + + bool is_end() const { return file == 0 || idx >= file->getCountArticles(); } + + public: + explicit const_iterator(const File* file_, article_index_type idx_, Mode mode_) + : file(file_), + idx(idx_), + mode(mode_) + { } + + article_index_type getIndex() const { return idx; } + const File& getFile() const { return *file; } + + bool operator== (const const_iterator& it) const + { return (is_end() && it.is_end()) + || (file == it.file && idx == it.idx); } + bool operator!= (const const_iterator& it) const + { return 
!operator==(it); } + + const_iterator& operator++() + { + ++idx; + article = Article(); + return *this; + } + + const_iterator operator++(int) + { + const_iterator it = *this; + operator++(); + return it; + } + + const_iterator& operator--() + { + --idx; + article = Article(); + return *this; + } + + const_iterator operator--(int) + { + const_iterator it = *this; + operator--(); + return it; + } + + const Article& operator*() const + { + if (!article.good()) + { + switch(mode) + { + case UrlIterator: + article = file->getArticle(idx); + break; + case ArticleIterator: + article = file->getArticleByTitle(idx); + break; + case ClusterIterator: + article = file->getArticleByClusterOrder(idx); + break; + } + } + return article; + } + + pointer operator->() const + { + operator*(); + return &article; + } + }; +} + +#endif // ZIM_FILEITERATOR_H + diff --git a/include/zim/search.h b/include/zim/search.h new file mode 100644 index 0000000..bc8d9a5 --- /dev/null +++ b/include/zim/search.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_H +#define ZIM_SEARCH_H + +#include "search_iterator.h" +#include +#include +#include + +namespace zim +{ + +class File; +class Search +{ + friend class search_iterator; + friend struct search_iterator::InternalData; + public: + typedef search_iterator iterator; + + explicit Search(const std::vector zimfiles); + explicit Search(const File* zimfile); + Search(const Search& it); + Search& operator=(const Search& it); + Search(Search&& it); + Search& operator=(Search&& it); + ~Search(); + + void set_verbose(bool verbose); + + Search& add_zimfile(const File* zimfile); + Search& set_query(const std::string& query); + Search& set_georange(float latitude, float longitude, float distance); + Search& set_range(int start, int end); + Search& set_suggestion_mode(bool suggestion_mode); + + search_iterator begin() const; + search_iterator end() const; + int get_matches_estimated() const; + + private: + struct InternalData; + std::unique_ptr internal; + std::vector zimfiles; + + mutable std::map valuesmap; + mutable std::string prefixes; + std::string query; + float latitude; + float longitude; + float distance; + int range_start; + int range_end; + bool suggestion_mode; + bool geo_query; + mutable bool search_started; + mutable bool has_database; + mutable bool verbose; + mutable int estimated_matches_number; +}; + +} //namespace zim + +#endif // ZIM_SEARCH_H diff --git a/include/zim/search_iterator.h b/include/zim/search_iterator.h new file mode 100644 index 0000000..9a44f32 --- /dev/null +++ b/include/zim/search_iterator.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by 
the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_ITERATOR_H +#define ZIM_SEARCH_ITERATOR_H + +#include +#include +#include "article.h" + +namespace zim +{ +class Search; +class search_iterator : public std::iterator +{ + friend class zim::Search; + public: + search_iterator(); + search_iterator(const search_iterator& it); + search_iterator& operator=(const search_iterator& it); + search_iterator(search_iterator&& it); + search_iterator& operator=(search_iterator&& it); + ~search_iterator(); + + bool operator== (const search_iterator& it) const; + bool operator!= (const search_iterator& it) const; + + search_iterator& operator++(); + search_iterator operator++(int); + search_iterator& operator--(); + search_iterator operator--(int); + + std::string get_url() const; + std::string get_title() const; + int get_score() const; + std::string get_snippet() const; + int get_wordCount() const; + int get_size() const; + int get_fileIndex() const; + reference operator*() const; + pointer operator->() const; + + private: + struct InternalData; + std::unique_ptr internal; + search_iterator(InternalData* internal_data); + + bool is_end() const; +}; + +} // namespace zim + +#endif // ZIM_SEARCH_ITERATOR_H diff --git a/include/zim/uuid.h b/include/zim/uuid.h new file mode 100644 index 0000000..f86b51c --- /dev/null +++ b/include/zim/uuid.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_UUID_H +#define ZIM_UUID_H + +#include +#include +#include +#include + +namespace zim +{ + struct Uuid + { + Uuid() + { + std::memset(data, 0, 16); + } + + Uuid(const char uuid[16]) + { + std::copy(uuid, uuid+16, data); + } + + static Uuid generate(std::string value = ""); + + bool operator== (const Uuid& other) const + { return std::equal(data, data+16, other.data); } + bool operator!= (const Uuid& other) const + { return !(*this == other); } + unsigned size() const { return 16; } + + char data[16]; + }; + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid); + +} + +#endif // ZIM_UUID_H diff --git a/include/zim/writer/article.h b/include/zim/writer/article.h new file mode 100644 index 0000000..f3c80fb --- /dev/null +++ b/include/zim/writer/article.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_ARTICLESOURCE_H +#define ZIM_WRITER_ARTICLESOURCE_H + +#include +#include +#include +#include +#include +#include + +namespace zim +{ + namespace writer + { + class Article + { + public: + virtual Url getUrl() const = 0; + virtual std::string getTitle() const = 0; + virtual bool isRedirect() const = 0; + virtual bool isLinktarget() const; + virtual bool isDeleted() const; + virtual std::string getMimeType() const = 0; + virtual bool shouldCompress() const = 0; + virtual bool shouldIndex() const = 0; + virtual Url getRedirectUrl() const = 0; + virtual zim::size_type getSize() const = 0; + virtual Blob getData() const = 0; + virtual std::string getFilename() const = 0; + virtual ~Article() = default; + + // returns the next category id, to which the article is assigned to + virtual std::string getNextCategory(); + }; + + } +} + +#endif // ZIM_WRITER_ARTICLESOURCE_H diff --git a/include/zim/writer/creator.h b/include/zim/writer/creator.h new file mode 100644 index 0000000..c47c621 --- /dev/null +++ b/include/zim/writer/creator.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CREATOR_H +#define ZIM_WRITER_CREATOR_H + +#include +#include +#include + +namespace zim +{ + class Fileheader; + namespace writer + { + class CreatorData; + class Creator + { + public: + Creator(bool verbose = false, CompressionType c = zimcompLzma); + virtual ~Creator(); + + zim::size_type getMinChunkSize() const { return minChunkSize; } + void setMinChunkSize(zim::size_type s) { minChunkSize = s; } + void setIndexing(bool indexing, std::string language) + { withIndex = indexing; indexingLanguage = language; } + DEPRECATED void setCompressionThreads(unsigned ct) { nbWorkerThreads = ct; } + void setNbWorkerThreads(unsigned ct) { nbWorkerThreads = ct; } + + + virtual void startZimCreation(const std::string& fname); + virtual void addArticle(std::shared_ptr
article); + virtual void finishZimCreation(); + + virtual Url getMainUrl() const { return Url(); } + virtual Url getLayoutUrl() const { return Url(); } + virtual zim::Uuid getUuid() const { return Uuid::generate(); } + + private: + std::unique_ptr data; + bool verbose; + const CompressionType compression; + bool withIndex = false; + size_t minChunkSize = 1024-64; + std::string indexingLanguage; + unsigned nbWorkerThreads = 4; + + void fillHeader(Fileheader* header) const; + void write() const; + }; + } + +} + +#endif // ZIM_WRITER_CREATOR_H diff --git a/include/zim/writer/url.h b/include/zim/writer/url.h new file mode 100644 index 0000000..b7fa96d --- /dev/null +++ b/include/zim/writer/url.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_URL_H +#define ZIM_WRITER_URL_H + +#include + +namespace zim +{ + namespace writer + { + class Url { + public: + Url() : + url(), + ns(0) + {} + Url(char ns, std::string url) : + url(url), + ns(ns) + {} + Url(std::string url) : + url(url.substr(2)), + ns(url[0]) + {} + char getNs() const { return ns; } + const std::string& getUrl() const { return url; } + std::string getLongUrl() const { return std::string(1, ns) + '/' + url; } + bool empty() const { return ns == 0 && url.empty(); } + private: + std::string url; + char ns; + friend bool operator< (const Url& lhs, const Url& rhs); + friend bool operator== (const Url& lhs, const Url& rhs); + }; + + inline bool operator< (const Url& lhs, const Url& rhs) { + return lhs.ns < rhs.ns + || (lhs.ns == rhs.ns && lhs.url < rhs.url); + } + inline bool operator== (const Url& lhs, const Url& rhs) { + return lhs.ns == rhs.ns && lhs.url == rhs.url; + } + } +} + +#endif // ZIM_WRITER_URL_H diff --git a/include/zim/zim.h b/include/zim/zim.h new file mode 100644 index 0000000..69780ac --- /dev/null +++ b/include/zim/zim.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ZIM_H +#define ZIM_ZIM_H + +#include + +#ifdef __GNUC__ +#define DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define DEPRECATED __declspec(deprecated) +#else +#pragma message("WARNING: You need to implement DEPRECATED for this compiler") +#define DEPRECATED +#endif + + +namespace zim +{ + // An index of an article (in a zim file) + typedef uint32_t article_index_type; + + // An index of a cluster (in a zim file) + typedef uint32_t cluster_index_type; + + // An index of a blob (in a cluster) + typedef uint32_t blob_index_type; + + // The size of something (article, zim, cluster, blob, ...) + typedef uint64_t size_type; + + // An offset. + typedef uint64_t offset_type; + + enum CompressionType + { + zimcompDefault, + zimcompNone, + zimcompZip, + zimcompBzip2, // Not supported anymore in the libzim + zimcompLzma, + zimcompZstd + }; + + static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; +} + +#endif // ZIM_ZIM_H + diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..04a1b36 --- /dev/null +++ b/meson.build @@ -0,0 +1,83 @@ +project('libzim', ['c', 'cpp'], + version : '6.1.8', + license : 'GPL2', + default_options : ['c_std=c11', 'cpp_std=c++11']) + +if build_machine.system() != 'windows' + add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp') +endif + +sizeof_off_t = meson.get_compiler('cpp').sizeof('off_t') + +conf = configuration_data() +conf.set('VERSION', '"@0@"'.format(meson.project_version())) +conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE')) +conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE')) +conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE')) +conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8) +if
target_machine.system() == 'windows' + conf.set('ENABLE_USE_MMAP', false) +else + conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP')) +endif +conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER')) + +static_linkage = get_option('static-linkage') +static_linkage = static_linkage or get_option('default_library')=='static' + +zlib_dep = dependency('zlib', required:false, static:static_linkage) +conf.set('ENABLE_ZLIB', zlib_dep.found()) + +lzma_dep = dependency('liblzma', static:static_linkage) + +zstd_dep = dependency('libzstd', required:false, static:static_linkage) +conf.set('ENABLE_ZSTD', zstd_dep.found()) + +xapian_dep = dependency('xapian-core', + required:false, + static:static_linkage) +conf.set('ENABLE_XAPIAN', xapian_dep.found()) + +pkg_requires = ['liblzma'] +if build_machine.system() == 'windows' + thread_dep = dependency('libpthreadVC2') + pkg_requires += ['libpthreadVC2'] + extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin'] + extra_cpp_args = ['-DSORTPP_PASS'] +else + thread_dep = dependency('threads') + extra_link_args = [] + extra_cpp_args = [] +endif +if zlib_dep.found() + pkg_requires += ['zlib'] +endif +if zstd_dep.found() + pkg_requires += ['libzstd'] +endif +if xapian_dep.found() + pkg_requires += ['xapian-core'] + icu_dep = dependency('icu-i18n', static:static_linkage) + pkg_requires += ['icu-i18n'] +else + icu_dep = dependency('icu-i18n', required:false, static:static_linkage) +endif + +gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false) + +inc = include_directories('include') + +subdir('include') +subdir('scripts') +subdir('static') +subdir('src') +subdir('examples') +subdir('test') + +pkg_mod = import('pkgconfig') +pkg_mod.generate(libraries : libzim, + version : meson.project_version(), + name : 'libzim', + filebase : 'libzim', + description : 'A Library to zim.', + requires : pkg_requires) diff --git a/meson_options.txt b/meson_options.txt new file mode 
100644 index 0000000..2175788 --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,14 @@ +option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', + description : 'set cluster cache size to number (default:16)') +option('DIRENT_CACHE_SIZE', type : 'string', value : '512', + description : 'set dirent cache size to number (default:512)') +option('LZMA_MEMORY_SIZE', type : 'string', value : '128', + description : 'set lzma uncompress memory in MB (default:128)') +option('USE_MMAP', type: 'boolean', value: true, + description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') +option('USE_BUFFER_HEADER', type: 'boolean', value: true, + description: '''Copy (or use mmap) header index buffers. (default:true) +Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. +If false, we directly read the index in the file at each article access.''') +option('static-linkage', type : 'boolean', value : false, + description : 'Link statically with the dependencies.') diff --git a/scripts/libzim-compile-resources b/scripts/libzim-compile-resources new file mode 100755 index 0000000..e4993ba --- /dev/null +++ b/scripts/libzim-compile-resources @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +''' +Copyright 2016 Matthieu Gautier + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or any +later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. 
+ +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. +''' + +import argparse +import os.path +import re + +def full_identifier(filename): + parts = os.path.normpath(filename).split(os.sep) + parts = [to_identifier(part) for part in parts] + print(filename, parts) + return parts + +def to_identifier(name): + ident = re.sub(r'[^0-9a-zA-Z]', '_', name) + if ident[0].isnumeric(): + return "_"+ident + return ident + +resource_impl_template = """ +static const unsigned char {data_identifier}[] = {{ + {resource_content} +}}; + +namespace RESOURCE {{ +{namespaces_open} +const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len}); +{namespaces_close} +}} +""" + +resource_getter_template = """ + if (name == "{common_name}") + return RESOURCE::{identifier}; +""" + +resource_decl_template = """{namespaces_open} +extern const std::string {identifier}; +{namespaces_close}""" + +class Resource: + def __init__(self, base_dirs, filename): + filename = filename.strip() + self.filename = filename + self.identifier = full_identifier(filename) + found = False + for base_dir in base_dirs: + try: + with open(os.path.join(base_dir, filename), 'rb') as f: + self.data = f.read() + found = True + break + except FileNotFoundError: + continue + if not found: + raise Exception("Impossible to found {}".format(filename)) + + def dump_impl(self): + nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0) + sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row)) + + return resource_impl_template.format( + data_identifier="_".join([""]+self.identifier), + resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced), + resource_len=len(self.data), + namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), + namespaces_close=" 
".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1], + env_identifier="RES_"+"_".join(self.identifier)+"_PATH" + ) + + def dump_getter(self): + return resource_getter_template.format( + common_name=self.filename, + identifier="::".join(self.identifier) + ) + + def dump_decl(self): + return resource_decl_template.format( + namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), + namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1] + ) + + + +master_c_template = """//This file is automaically generated. Do not modify it. + +#include +#include +#include "{include_file}" + +static std::string init_resource(const char* name, const unsigned char* content, int len) +{{ + char * resPath = getenv(name); + if (NULL == resPath) + return std::string(reinterpret_cast(content), len); + + std::ifstream ifs(resPath); + if (!ifs.good()) + return std::string(reinterpret_cast(content), len); + return std::string( (std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator() )); +}} + +const std::string& getResource_{basename}(const std::string& name) {{ +{RESOURCES_GETTER} + throw ResourceNotFound("Resource not found."); +}} + +{RESOURCES} + +""" + +def gen_c_file(resources, basename): + return master_c_template.format( + RESOURCES="\n\n".join(r.dump_impl() for r in resources), + RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources), + include_file=basename, + basename=to_identifier(basename) + ) + + + +master_h_template = """//This file is automaically generated. Do not modify it. 
+#ifndef KIWIX_{BASENAME} +#define KIWIX_{BASENAME} + +#include +#include + +namespace RESOURCE {{ + {RESOURCES} +}}; + +class ResourceNotFound : public std::runtime_error {{ + public: + ResourceNotFound(const std::string& what_arg): + std::runtime_error(what_arg) + {{ }}; +}}; + +const std::string& getResource_{basename}(const std::string& name); + +#define getResource(a) (getResource_{basename}(a)) + +#endif // KIWIX_{BASENAME} + +""" + +def gen_h_file(resources, basename): + return master_h_template.format( + RESOURCES="\n ".join(r.dump_decl() for r in resources), + BASENAME=basename.upper(), + basename=basename, + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--cxxfile', + help='The Cpp file name to generate') + parser.add_argument('--hfile', + help='The h file name to generate') + parser.add_argument('--source_dir', + help="Additional directory where to look for resources.", + action='append') + parser.add_argument('resource_file', + help='The list of resources to compile.') + args = parser.parse_args() + + base_dir = os.path.dirname(os.path.realpath(args.resource_file)) + source_dir = args.source_dir or [] + with open(args.resource_file, 'r') as f: + resources = [Resource([base_dir]+source_dir, filename) + for filename in f.readlines()] + + h_identifier = to_identifier(os.path.basename(args.hfile)) + with open(args.hfile, 'w') as f: + f.write(gen_h_file(resources, h_identifier)) + + with open(args.cxxfile, 'w') as f: + f.write(gen_c_file(resources, os.path.basename(args.hfile))) + diff --git a/scripts/meson.build b/scripts/meson.build new file mode 100644 index 0000000..e1437ae --- /dev/null +++ b/scripts/meson.build @@ -0,0 +1,2 @@ + +res_compiler = find_program('libzim-compile-resources') diff --git a/src/_dirent.h b/src/_dirent.h new file mode 100644 index 0000000..767db5c --- /dev/null +++ b/src/_dirent.h @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you 
can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENT_H +#define ZIM_DIRENT_H + +#include +#include +#include +#include + +#include "zim_types.h" +#include "debug.h" + +namespace zim +{ + class Buffer; + class InvalidSize : public std::exception {}; + class Dirent + { + protected: + uint16_t mimeType; + + uint32_t version; + + cluster_index_t clusterNumber; // only used when redirect is false + blob_index_t blobNumber; // only used when redirect is false + + article_index_t redirectIndex; // only used when redirect is true + + char ns; + std::string title; + std::string url; + std::string parameter; + + public: + // these constants are put into mimeType field + static const uint16_t redirectMimeType = 0xffff; + static const uint16_t linktargetMimeType = 0xfffe; + static const uint16_t deletedMimeType = 0xfffd; + + Dirent() + : mimeType(0), + version(0), + clusterNumber(0), + blobNumber(0), + redirectIndex(0), + ns('\0') + {} + + Dirent(std::unique_ptr buffer); + + bool isRedirect() const { return mimeType == redirectMimeType; } + bool isLinktarget() const { return mimeType == linktargetMimeType; } + bool isDeleted() const { return mimeType == deletedMimeType; } + bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } + uint16_t getMimeType() const { 
return mimeType; } + + uint32_t getVersion() const { return version; } + void setVersion(uint32_t v) { version = v; } + + cluster_index_t getClusterNumber() const { return isRedirect() ? cluster_index_t(0) : clusterNumber; } + blob_index_t getBlobNumber() const { return isRedirect() ? blob_index_t(0) : blobNumber; } + + article_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : article_index_t(0); } + + char getNamespace() const { return ns; } + const std::string& getTitle() const { return title.empty() ? url : title; } + const std::string& getUrl() const { return url; } + std::string getLongUrl() const; + const std::string& getParameter() const { return parameter; } + + size_t getDirentSize() const + { + size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2; + if (title != url) + ret += title.size(); + return ret; + } + + void setTitle(const std::string& title_) + { + title = title_; + } + + void setUrl(char ns_, const std::string& url_) + { + ns = ns_; + url = url_; + } + + void setParameter(const std::string& parameter_) + { + parameter = parameter_; + } + + void setRedirect(article_index_t idx) + { + redirectIndex = idx; + mimeType = redirectMimeType; + } + + void setMimeType(uint16_t mime) + { + mimeType = mime; + } + + void setLinktarget() + { + ASSERT(mimeType, ==, 0); + mimeType = linktargetMimeType; + } + + void setDeleted() + { + ASSERT(mimeType, ==, 0); + mimeType = deletedMimeType; + } + + void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) + { + ASSERT(mimeType, ==, 0); + mimeType = mimeType_; + clusterNumber = clusterNumber_; + blobNumber = blobNumber_; + } + }; +} + +#endif // ZIM_DIRENT_H + diff --git a/src/article.cpp b/src/article.cpp new file mode 100644 index 0000000..045228f --- /dev/null +++ b/src/article.cpp @@ -0,0 +1,288 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the 
terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "template.h" +#include "_dirent.h" +#include "cluster.h" +#include +#include "fileimpl.h" +#include "file_part.h" +#include +#include +#include +#include +#include "log.h" + +log_define("zim.article") + +namespace zim +{ + size_type Article::getArticleSize() const + { + auto dirent = getDirent(); + return size_type(file->getCluster(dirent->getClusterNumber()) + ->getBlobSize(dirent->getBlobNumber())); + } + + namespace + { + class Ev : public TemplateParser::Event + { + std::ostream& out; + Article& article; + std::shared_ptr file; + unsigned maxRecurse; + + public: + Ev(std::ostream& out_, Article& article_, std::shared_ptr file_, unsigned maxRecurse_) + : out(out_), + article(article_), + file(file_), + maxRecurse(maxRecurse_) + { } + void onData(const std::string& data); + void onToken(const std::string& token); + void onLink(char ns, const std::string& title); + }; + + void Ev::onData(const std::string& data) + { + out << data; + } + + void Ev::onToken(const std::string& token) + { + log_trace("onToken(\"" << token << "\")"); + + if (token == "title") + out << article.getTitle(); + else if (token == "url") + out << article.getUrl(); + else if (token == "namespace") + out << article.getNamespace(); + else if (token == "content") + { + if (maxRecurse <= 0) + throw 
std::runtime_error("maximum recursive limit is reached"); + article.getPage(out, false, maxRecurse - 1); + } + else + { + log_warn("unknown token \"" << token << "\" found in template"); + out << "<%" << token << "%>"; + } + } + + void Ev::onLink(char ns, const std::string& url) + { + if (maxRecurse <= 0) + throw std::runtime_error("maximum recursive limit is reached"); + std::pair r = file->findx(ns, url); + if (r.first) { + Article(file, article_index_type(r.second)).getPage(out, false, maxRecurse - 1); + } else { + throw std::runtime_error(std::string("impossible to find article ") + std::string(1, ns) + std::string("/") + url); + } + } + + } + + std::shared_ptr Article::getDirent() const + { + return file->getDirent(article_index_t(idx)); + } + + std::string Article::getParameter() const + { + return getDirent()->getParameter(); + } + + std::string Article::getTitle() const + { + return getDirent()->getTitle(); + } + + std::string Article::getUrl() const + { + return getDirent()->getUrl(); + } + + std::string Article::getLongUrl() const + { + return getDirent()->getLongUrl(); + } + + uint16_t Article::getLibraryMimeType() const + { + return getDirent()->getMimeType(); + } + + const std::string& Article::getMimeType() const + { + return file->getMimeType(getLibraryMimeType()); + } + + bool Article::isRedirect() const + { + return getDirent()->isRedirect(); + } + + bool Article::isLinktarget() const + { + return getDirent()->isLinktarget(); + } + + bool Article::isDeleted() const + { + return getDirent()->isDeleted(); + } + + char Article::getNamespace() const + { + return getDirent()->getNamespace(); + } + + article_index_type Article::getRedirectIndex() const + { + return article_index_type(getDirent()->getRedirectIndex()); + } + + Article Article::getRedirectArticle() const + { + return Article(file, getRedirectIndex()); + } + + std::shared_ptr Article::getCluster() const + { + auto dirent = getDirent(); + if ( dirent->isRedirect() + || dirent->isLinktarget() 
+ || dirent->isDeleted() ) { + return std::shared_ptr(); + } + return file->getCluster(dirent->getClusterNumber()); + } + cluster_index_type Article::getClusterNumber() const { + auto dirent= getDirent(); + if ( dirent->isRedirect() + || dirent->isLinktarget() + || dirent->isDeleted() ) { + return std::numeric_limits::max(); + } + return dirent->getClusterNumber().v; +} + + Blob Article::getData(offset_type offset) const + { + auto size = getArticleSize()-offset; + return getData(offset, size); + } + + Blob Article::getData(offset_type offset, size_type size) const + { + std::shared_ptr cluster = getCluster(); + if (!cluster) { + return Blob(); + } + return cluster->getBlob(getDirent()->getBlobNumber(), offset_t(offset), zsize_t(size)); + } + + offset_type Article::getOffset() const + { + auto dirent = getDirent(); + if (dirent->isRedirect() + || dirent->isLinktarget() + || dirent->isDeleted()) + return 0; + return offset_type(file->getBlobOffset(dirent->getClusterNumber(), dirent->getBlobNumber())); + } + + std::pair Article::getDirectAccessInformation() const + { + auto dirent = getDirent(); + if ( dirent->isRedirect() + || dirent->isLinktarget() + || dirent->isDeleted() ) { + return std::make_pair("", 0); + } + + auto full_offset = file->getBlobOffset(dirent->getClusterNumber(), + dirent->getBlobNumber()); + + if (!full_offset) { + // cluster is compressed + return std::make_pair("", 0); + } + auto part_its = file->getFileParts(full_offset, zsize_t(getArticleSize())); + auto range = part_its.first->first; + auto part = part_its.first->second; + if (++part_its.first != part_its.second) { + return std::make_pair("", 0); + } + auto local_offset = full_offset - range.min; + return std::make_pair(part->filename(), offset_type(local_offset)); + } + + std::string Article::getPage(bool layout, unsigned maxRecurse) + { + std::ostringstream s; + getPage(s, layout, maxRecurse); + return s.str(); + } + + void Article::getPage(std::ostream& out, bool layout, unsigned 
maxRecurse) + { + log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')'); + + if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate) + { + if (layout && file->getFileheader().hasLayoutPage()) + { + Article layoutPage(file, file->getFileheader().getLayoutPage()); + Blob data = layoutPage.getData(); + + Ev ev(out, *this, file, maxRecurse); + log_debug("call template parser"); + TemplateParser parser(&ev); + for (const char* p = data.data(); p != data.end(); ++p) + parser.parse(*p); + parser.flush(); + + return; + } + else if (getMimeType() == MimeHtmlTemplate) + { + Blob data = getData(); + + Ev ev(out, *this, file, maxRecurse); + TemplateParser parser(&ev); + for (const char* p = data.data(); p != data.end(); ++p) + parser.parse(*p); + parser.flush(); + + return; + } + } + + // default case - template cases has return above + out << getData(); + } + +} diff --git a/src/blob.cpp b/src/blob.cpp new file mode 100644 index 0000000..fe5b82f --- /dev/null +++ b/src/blob.cpp @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "zim/blob.h" +#include "debug.h" +#include "buffer.h" + +namespace zim { + +Blob::Blob() + : _data(0), + _size(0) +{} + +Blob::Blob(const char* data, size_type size) + : _data(data), + _size(size) +{ + ASSERT(size, <, SIZE_MAX); + ASSERT(data, <, (void*)(SIZE_MAX-size)); +} + +Blob::Blob(std::shared_ptr buffer) + : _data(buffer->data()), + _size(size_type(buffer->size())), + _buffer(buffer) +{} + + + + +} //zim diff --git a/src/buffer.cpp b/src/buffer.cpp new file mode 100644 index 0000000..a1fa84d --- /dev/null +++ b/src/buffer.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "buffer.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 +# include +# include +#endif + +namespace zim { + +std::shared_ptr Buffer::sub_buffer(offset_t offset, zsize_t size) const +{ + return std::make_shared(shared_from_this(), offset, size); +} + +#ifdef ENABLE_USE_MMAP +MMapBuffer::MMapBuffer(int fd, offset_t offset, zsize_t size): + Buffer(size), + _offset(0) +{ + offset_t pa_offset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1)); + _offset = offset-pa_offset; +#if defined(__APPLE__) || defined(__OpenBSD__) + #define MAP_FLAGS MAP_PRIVATE +#else + #define MAP_FLAGS MAP_PRIVATE|MAP_POPULATE +#endif +#if !MMAP_SUPPORT_64 + if(pa_offset.v >= INT32_MAX) { + throw MMapException(); + } +#endif + _data = (char*)mmap(NULL, size.v + _offset.v, PROT_READ, MAP_FLAGS, fd, pa_offset.v); + if (_data == MAP_FAILED ) + { + std::ostringstream s; + s << "Cannot mmap size " << size.v << " at off " << offset.v << " : " << strerror(errno); + throw std::runtime_error(s.str()); + } +#undef MAP_FLAGS +} + +MMapBuffer::~MMapBuffer() +{ + munmap(_data, size_.v + _offset.v); +} + +#endif + +} //zim diff --git a/src/buffer.h b/src/buffer.h new file mode 100644 index 0000000..5d07aea --- /dev/null +++ b/src/buffer.h @@ -0,0 +1,125 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BUFFER_H_ +#define ZIM_BUFFER_H_ + +#include +#include +#include +#include + +#include "config.h" +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" + +namespace zim { + +class MMapException : std::exception {}; + +class Buffer : public std::enable_shared_from_this { + public: + Buffer(zsize_t size) + : size_(size) + { + ASSERT(size_.v, <, SIZE_MAX); + }; + virtual ~Buffer() {}; + virtual const char* data(offset_t offset=offset_t(0)) const = 0; + virtual char at(offset_t offset) const { + return *(data(offset)); + } + zsize_t size() const { return size_; } + virtual std::shared_ptr sub_buffer(offset_t offset, zsize_t size) const; + + template + T as(offset_t offset) const { + ASSERT(offset.v, <, size_.v); + ASSERT(offset.v+sizeof(T), <=, size_.v); + return fromLittleEndian(data(offset)); + } + + protected: + const zsize_t size_; +}; + + +template +class MemoryBuffer : public Buffer { + public: + MemoryBuffer(const char* buffer, zsize_t size) + : Buffer(size), + _data(buffer) + {} + + virtual ~MemoryBuffer() { + if ( CLEAN_AT_END ) { + delete [] _data; + } + } + + const char* data(offset_t offset) const { + ASSERT(offset.v, <=, size_.v); + return _data + offset.v; + } + private: + const char* _data; +}; + + +#ifdef ENABLE_USE_MMAP +class MMapBuffer : public Buffer { + public: + MMapBuffer(int fd, offset_t offset, zsize_t size); + ~MMapBuffer(); + + const char* data(offset_t offset) const { + offset += _offset; + return _data 
+ offset.v; + } + + private: + offset_t _offset; + char* _data; +}; +#endif + + +class SubBuffer : public Buffer { + public: + SubBuffer(const std::shared_ptr src, offset_t offset, zsize_t size) + : Buffer(size), + _data(src, src->data(offset)) + { + ASSERT(offset.v+size.v, <=, src->size().v); + } + + const char* data(offset_t offset) const { + ASSERT(offset.v, <=, size_.v); + return _data.get() + offset.v; + } + + private: + std::shared_ptr _data; +}; + +}; + +#endif //ZIM_BUFFER_H_ diff --git a/src/cache.h b/src/cache.h new file mode 100644 index 0000000..b889fd9 --- /dev/null +++ b/src/cache.h @@ -0,0 +1,345 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CACHE_H +#define ZIM_CACHE_H + +#include +#include +#include + +namespace zim +{ + /** + Implements a container for caching elements. + + The cache holds a list of key-value-pairs. There are 2 main operations for + accessing the cache: put and get. Put takes a key and a value and puts the + element into the list. Get takes a key and optional a value. If the value + for the key is found, it is returned. The passed value otherwise. By + default the value is constructed with the empty ctor of the value-type. 
+ + The cache has a maximum size, after which key-value-pairs are dropped, + when a new item is put into the cache. + + The algorithm for this cache is as follows: + - when the cache is not full, new elements are appended + - new elements are put into the middle of the list otherwise + - the last element of the list is then dropped + - when getting a value and the value is found, it is put to the + beginning of the list + + When elements are searched, a linear search is done using the ==-operator + of the key type. + + The caching algorithm keeps elements, which are fetched more than once in + the first half of the list. In the second half the elements are either new + or the elements are pushed from the first half to the second half by other + elements, which are found in the cache. + + You should be aware, that the key type should be simple. Comparing keys + must be cheap. Copying elements (both key and value) must be possible and + should be cheap, since they are moved in the underlying container. 
+ + */ + template + class Cache + { + struct Data + { + bool winner; + unsigned serial; + Value value; + Data() { } + Data(bool winner_, unsigned serial_, const Value& value_) + : winner(winner_), + serial(serial_), + value(value_) + { } + }; + + typedef std::map DataType; + DataType data; + + typename DataType::size_type maxElements; + unsigned serial; + unsigned hits; + unsigned misses; + + unsigned _nextSerial() + { + if (serial == std::numeric_limits::max()) + { + for (typename DataType::iterator it = data.begin(); it != data.end(); ++it) + it->second.serial = 0; + serial = 1; + } + + return serial++; + } + + typename DataType::iterator _getOldest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial < foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + typename DataType::iterator _getNewest(bool winner) + { + typename DataType::iterator foundElement = data.begin(); + + typename DataType::iterator it = data.begin(); + + for (++it; it != data.end(); ++it) + if (it->second.winner == winner + && (foundElement->second.winner != winner || it->second.serial > foundElement->second.serial)) + foundElement = it; + + return foundElement; + } + + // drop one element + void _dropLooser() + { + // look for the oldest element in the list of loosers to drop it + data.erase(_getOldest(false)); + } + + void _makeLooser() + { + // look for the oldest element in the list of winners to make it a looser + typename DataType::iterator it = _getOldest(true); + it->second.winner = false; + it->second.serial = _nextSerial(); + } + + public: + typedef typename DataType::size_type size_type; + typedef Value value_type; + + explicit Cache(size_type maxElements_) + : maxElements(maxElements_ + (maxElements_ & 1)), + serial(0), + hits(0), + misses(0) + 
{ } + + /// returns the number of elements currently in the cache + size_type size() const { return data.size(); } + + /// returns the maximum number of elements in the cache + size_type getMaxElements() const { return maxElements; } + + void setMaxElements(size_type maxElements_) + { + size_type numWinners = size() < maxElements / 2 ? size() : maxElements / 2; + + maxElements_ += (maxElements_ & 1); + + if (maxElements_ > maxElements) + { + maxElements = maxElements_; + + while (numWinners < maxElements / 2) + { + _getNewest(false)->winner = true; + ++numWinners; + } + } + else + { + while (maxElements > maxElements_) + { + _dropLooser(); + _dropLooser(); + _makeLooser(); + maxElements -= 2; + } + + while (numWinners > maxElements / 2) + { + _getNewest(true)->winner = false; + --numWinners; + } + } + } + + /// removes a element from the cache and returns true, if found + bool erase(const Key& key) + { + typename DataType::iterator it = data.find(key); + if (it == data.end()) + return false; + + if (it->second.winner) + _getNewest(false)->winner=true; + + data.erase(it); + return true; + } + + /// clears the cache. + void clear(bool stats = false) + { + data.clear(); + if (stats) + hits = misses = 0; + } + + /// puts a new element in the cache. If the element is already found in + /// the cache, it is considered a cache hit and pushed to the top of the + /// list. 
+ void put(const Key& key, const Value& value) + { + typename DataType::iterator it; + if (data.size() < maxElements) + { + data.insert(data.begin(), + typename DataType::value_type(key, + Data(data.size() < maxElements / 2, _nextSerial(), value))); + } + else if ((it = data.find(key)) == data.end()) + { + // element not found + _dropLooser(); + data.insert(data.begin(), + typename DataType::value_type(key, + Data(false, _nextSerial(), value))); + } + else + { + // element found + it->second.serial = _nextSerial(); + if (!it->second.winner) + { + // move element to the winner part + it->second.winner = true; + _makeLooser(); + } + } + } + + /// puts a new element on the top of the cache. If the element is already + /// found in the cache, it is considered a cache hit and pushed to the + /// top of the list. This method actually overrides the need, that a element + /// needs a hit to get to the top of the cache. + void put_top(const Key& key, const Value& value) + { + typename DataType::iterator it; + if (data.size() < maxElements) + { + if (data.size() >= maxElements / 2) + _makeLooser(); + + data.insert(data.begin(), + typename DataType::value_type(key, + Data(true, _nextSerial(), value))); + } + else if ((it = data.find(key)) == data.end()) + { + // element not found + _dropLooser(); + _makeLooser(); + data.insert(data.begin(), + typename DataType::value_type(key, + Data(true, _nextSerial(), value))); + } + else + { + // element found + it->second.serial = _nextSerial(); + if (!it->second.winner) + { + // move element to the winner part + it->second.winner = true; + _makeLooser(); + } + } + } + + Value* getptr(const Key& key) + { + typename DataType::iterator it = data.find(key); + if (it == data.end()) + return 0; + + it->second.serial = _nextSerial(); + + if (!it->second.winner) + { + // move element to the winner part + it->second.winner = true; + _makeLooser(); + } + + return &it->second.value; + } + + /// returns a pair of values - a flag, if the value was 
found and the + /// value if found or the passed default otherwise. If the value is + /// found it is a cahce hit and pushed to the top of the list. + std::pair getx(const Key& key, Value def = Value()) + { + Value* v = getptr(key); + return v ? std::pair(true, *v) + : std::pair(false, def); + } + + /// returns the value to a key or the passed default value if not found. + /// If the value is found it is a cahce hit and pushed to the top of the + /// list. + Value get(const Key& key, Value def = Value()) + { + return getx(key, def).second; + } + + /// returns the number of hits. + unsigned getHits() const { return hits; } + /// returns the number of misses. + unsigned getMisses() const { return misses; } + /// returns the cache hit ratio between 0 and 1. + double hitRatio() const { return hits+misses > 0 ? static_cast(hits)/static_cast(hits+misses) : 0; } + /// returns the ratio, between held elements and maximum elements. + double fillfactor() const { return static_cast(data.size()) / static_cast(maxElements); } + +/* + void dump(std::ostream& out) const + { + out << "cache max size=" << maxElements << " current size=" << size() << '\n'; + for (typename DataType::const_iterator it = data.begin(); it != data.end(); ++it) + { + out << "\tkey=\"" << it->first << "\" value=\"" << it->second.value << "\" serial=" << it->second.serial << " winner=" << it->second.winner << '\n'; + } + out << "--------\n"; + } +*/ + + }; + +} + +#endif // ZIM_CACHE_H diff --git a/src/cluster.cpp b/src/cluster.cpp new file mode 100644 index 0000000..9113d6d --- /dev/null +++ b/src/cluster.cpp @@ -0,0 +1,143 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include +#include +#include "file_reader.h" +#include "endian_tools.h" +#include +#include +#include + +#include "log.h" + +#include "config.h" + +log_define("zim.cluster") + +#define log_debug1(e) + +namespace zim +{ + Cluster::Cluster(std::shared_ptr reader_, CompressionType comp, bool isExtended) + : compression(comp), + isExtended(isExtended), + reader(reader_), + startOffset(0) + { + auto d = reader->offset(); + if (isExtended) { + startOffset = read_header(); + } else { + startOffset = read_header(); + } + reader = reader->sub_reader(startOffset); + auto d1 = reader->offset(); + ASSERT(d+startOffset, ==, d1); + } + + /* This return the number of char read */ + template + offset_t Cluster::read_header() + { + // read first offset, which specifies, how many offsets we need to read + OFFSET_TYPE offset; + offset = reader->read_uint(offset_t(0)); + + size_t n_offset = offset / sizeof(OFFSET_TYPE); + offset_t data_address(offset); + + // read offsets + offsets.clear(); + offsets.reserve(n_offset); + offsets.push_back(offset_t(0)); + + auto buffer = reader->get_buffer(offset_t(0), zsize_t(offset)); + offset_t current = offset_t(sizeof(OFFSET_TYPE)); + while (--n_offset) + { + OFFSET_TYPE new_offset = buffer->as(current); + ASSERT(new_offset, >=, offset); + ASSERT(offset, >=, data_address.v); + ASSERT(offset, <=, reader->size().v); + + offset = new_offset; + offsets.push_back(offset_t(offset - data_address.v)); + current += 
sizeof(OFFSET_TYPE); + } + ASSERT(offset, ==, reader->size().v); + return data_address; + } + + Blob Cluster::getBlob(blob_index_t n) const + { + if (size()) { + auto blobSize = getBlobSize(n); + if (blobSize.v > SIZE_MAX) { + return Blob(); + } + auto buffer = reader->get_buffer(offsets[blob_index_type(n)], getBlobSize(n)); + return Blob(buffer); + } else { + return Blob(); + } + } + + Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const + { + if (this->size()) { + offset += offsets[blob_index_type(n)]; + size = std::min(size, getBlobSize(n)); + if (size.v > SIZE_MAX) { + return Blob(); + } + auto buffer = reader->get_buffer(offset, size); + return Blob(buffer); + } else { + return Blob(); + } + } + + zsize_t Cluster::size() const + { + if (isExtended) + return zsize_t(offsets.size() * sizeof(uint64_t) + reader->size().v); + else + return zsize_t(offsets.size() * sizeof(uint32_t) + reader->size().v); + } + + template + zsize_t _read_size(const Reader* reader, offset_t offset) + { + OFFSET_TYPE blob_offset = reader->read_uint(offset); + auto off = offset+offset_t(blob_offset-sizeof(OFFSET_TYPE)); + auto s = reader->read_uint(off); + return zsize_t(s); + } + + zsize_t Cluster::read_size(const Reader* reader, bool isExtended, offset_t offset) + { + if (isExtended) + return _read_size(reader, offset); + else + return _read_size(reader, offset); + } + +} diff --git a/src/cluster.h b/src/cluster.h new file mode 100644 index 0000000..c376e96 --- /dev/null +++ b/src/cluster.h @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CLUSTER_H +#define ZIM_CLUSTER_H + +#include +#include "buffer.h" +#include "zim_types.h" +#include "file_reader.h" +#include +#include +#include + +#include "zim_types.h" + +namespace zim +{ + class Blob; + class Reader; + + class Cluster : public std::enable_shared_from_this { + typedef std::vector Offsets; + + const CompressionType compression; + const bool isExtended; + Offsets offsets; + std::shared_ptr reader; + offset_t startOffset; + + template + offset_t read_header(); + + public: + Cluster(std::shared_ptr reader, CompressionType comp, bool isExtended); + CompressionType getCompression() const { return compression; } + bool isCompressed() const { return compression != zimcompDefault && compression != zimcompNone; } + + blob_index_t count() const { return blob_index_t(offsets.size() - 1); } + zsize_t size() const; + + zsize_t getBlobSize(blob_index_t n) const { return zsize_t(offsets[blob_index_type(n)+1].v + - offsets[blob_index_type(n)].v); } + offset_t getBlobOffset(blob_index_t n) const { return startOffset + offsets[blob_index_type(n)]; } + Blob getBlob(blob_index_t n) const; + Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; + void clear(); + + void init_from_buffer(Buffer& buffer); + static zsize_t read_size(const Reader* reader, bool isExtended, offset_t offset); + }; + +} + +#endif // ZIM_CLUSTER_H diff --git a/src/compression.cpp b/src/compression.cpp new file mode 100644 index 0000000..8f92a8e --- /dev/null 
+++ b/src/compression.cpp @@ -0,0 +1,224 @@ +#include "compression.h" + +#include "envvalue.h" + +#include +#include + +const std::string LZMA_INFO::name = "lzma"; +void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data) +{ + *stream = LZMA_STREAM_INIT; + unsigned memsize = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); + auto errcode = lzma_stream_decoder(stream, memsize, 0); + if (errcode != LZMA_OK) { + throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream"); + } +} + +void LZMA_INFO::init_stream_encoder(stream_t* stream, char* raw_data) +{ + *stream = LZMA_STREAM_INIT; + auto errcode = lzma_easy_encoder(stream, 9 | LZMA_PRESET_EXTREME, LZMA_CHECK_CRC32); + if (errcode != LZMA_OK) { + throw std::runtime_error("Cannot initialize lzma_easy_encoder"); + } +} + +CompStatus LZMA_INFO::stream_run_encode(stream_t* stream, CompStep step) { + return stream_run(stream, step); +} + +CompStatus LZMA_INFO::stream_run_decode(stream_t* stream, CompStep step) { + return stream_run(stream, step); +} + +CompStatus LZMA_INFO::stream_run(stream_t* stream, CompStep step) +{ + auto errcode = lzma_code(stream, step==CompStep::STEP?LZMA_RUN:LZMA_FINISH); + if (errcode == LZMA_BUF_ERROR) + return CompStatus::BUF_ERROR; + if (errcode == LZMA_STREAM_END) + return CompStatus::STREAM_END; + if (errcode == LZMA_OK) + return CompStatus::OK; + return CompStatus::OTHER; +} + +void LZMA_INFO::stream_end_decode(stream_t* stream) +{ + lzma_end(stream); +} + +void LZMA_INFO::stream_end_encode(stream_t* stream) +{ + lzma_end(stream); +} + + +#if defined(ENABLE_ZLIB) +const std::string ZIP_INFO::name = "zlib"; +void ZIP_INFO::init_stream_decoder(stream_t* stream, char* raw_data) +{ + memset(stream, 0, sizeof(stream_t)); + stream->next_in = (unsigned char*) raw_data; + stream->avail_in = 1024; + auto errcode = ::inflateInit(stream); + if (errcode != Z_OK) { + throw std::runtime_error("Impossible to allocated needed memory to 
uncompress zlib stream"); + } +} + +void ZIP_INFO::init_stream_encoder(stream_t* stream, char* raw_data) +{ + memset(stream, 0, sizeof(z_stream)); + auto errcode = ::deflateInit(stream, Z_DEFAULT_COMPRESSION); + if (errcode != Z_OK) { + throw std::runtime_error("Impossible to allocated needed memory to uncompress zlib stream"); + } +} + +CompStatus ZIP_INFO::stream_run_decode(stream_t* stream, CompStep step) { + auto errcode = ::inflate(stream, step==CompStep::STEP?Z_SYNC_FLUSH:Z_FINISH); + if (errcode == Z_BUF_ERROR) + return CompStatus::BUF_ERROR; + if (errcode == Z_STREAM_END) + return CompStatus::STREAM_END; + if (errcode == Z_OK) + return CompStatus::OK; + return CompStatus::OTHER; +} + +CompStatus ZIP_INFO::stream_run_encode(stream_t* stream, CompStep step) { + auto errcode = ::deflate(stream, step==CompStep::STEP?Z_SYNC_FLUSH:Z_FINISH); + if (errcode == Z_BUF_ERROR) + return CompStatus::BUF_ERROR; + if (errcode == Z_STREAM_END) + return CompStatus::STREAM_END; + if (errcode == Z_OK) + return CompStatus::OK; + return CompStatus::OTHER; +} + +void ZIP_INFO::stream_end_decode(stream_t* stream) { + auto ret = ::inflateEnd(stream); + ASSERT(ret, ==, Z_OK); +} + +void ZIP_INFO::stream_end_encode(stream_t* stream) { + auto ret = ::deflateEnd(stream); + ASSERT(ret, ==, Z_OK); +} +#endif // ENABLE_ZLIB + +#if defined(ENABLE_ZSTD) +const std::string ZSTD_INFO::name = "zstd"; + +ZSTD_INFO::stream_t::stream_t() +: next_in(nullptr), + avail_in(0), + next_out(nullptr), + avail_out(0), + total_out(0), + encoder_stream(nullptr), + decoder_stream(nullptr) +{} + +ZSTD_INFO::stream_t::~stream_t() +{ + if ( encoder_stream ) + ::ZSTD_freeCStream(encoder_stream); + + if ( decoder_stream ) + ::ZSTD_freeDStream(decoder_stream); +} + +void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data) +{ + stream->decoder_stream = ::ZSTD_createDStream(); + auto ret = ::ZSTD_initDStream(stream->decoder_stream); + if (::ZSTD_isError(ret)) { + throw std::runtime_error("Failed to 
initialize Zstd decompression"); + } +} + +void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data) +{ + stream->encoder_stream = ::ZSTD_createCStream(); + auto ret = ::ZSTD_initCStream(stream->encoder_stream, ::ZSTD_maxCLevel()); + if (::ZSTD_isError(ret)) { + throw std::runtime_error("Failed to initialize Zstd compression"); + } +} + +CompStatus ZSTD_INFO::stream_run_encode(stream_t* stream, CompStep step) { + ::ZSTD_inBuffer inBuf; + inBuf.src = stream->next_in; + inBuf.size = stream->avail_in; + inBuf.pos = 0; + + ::ZSTD_outBuffer outBuf; + outBuf.dst = stream->next_out; + outBuf.size = stream->avail_out; + outBuf.pos = 0; + + auto ret = step == CompStep::STEP + ? ::ZSTD_compressStream(stream->encoder_stream, &outBuf, &inBuf) + : ::ZSTD_endStream(stream->encoder_stream, &outBuf); + stream->next_in += inBuf.pos; + stream->avail_in -= inBuf.pos; + stream->next_out += outBuf.pos; + stream->avail_out -= outBuf.pos; + stream->total_out += outBuf.pos; + + if (::ZSTD_isError(ret)) { + return CompStatus::OTHER; + } + + if ( step == CompStep::STEP ) { + if ( stream->avail_in != 0) { + ASSERT(stream->avail_out, ==, 0u); + return CompStatus::BUF_ERROR; + } + } else if ( ret > 0 ) { + return CompStatus::BUF_ERROR; + } + + return CompStatus::OK; +} + +CompStatus ZSTD_INFO::stream_run_decode(stream_t* stream, CompStep /*step*/) { + ::ZSTD_inBuffer inBuf; + inBuf.src = stream->next_in; + inBuf.size = stream->avail_in; + inBuf.pos = 0; + + ::ZSTD_outBuffer outBuf; + outBuf.dst = stream->next_out; + outBuf.size = stream->avail_out; + outBuf.pos = 0; + + auto ret = ::ZSTD_decompressStream(stream->decoder_stream, &outBuf, &inBuf); + stream->next_in += inBuf.pos; + stream->avail_in -= inBuf.pos; + stream->next_out += outBuf.pos; + stream->avail_out -= outBuf.pos; + stream->total_out += outBuf.pos; + + if (::ZSTD_isError(ret)) + return CompStatus::OTHER; + + if (ret == 0) + return CompStatus::STREAM_END; + + return CompStatus::BUF_ERROR; +} + +void 
ZSTD_INFO::stream_end_decode(stream_t* stream) +{ +} + +void ZSTD_INFO::stream_end_encode(stream_t* stream) +{ +} +#endif diff --git a/src/compression.h b/src/compression.h new file mode 100644 index 0000000..001be20 --- /dev/null +++ b/src/compression.h @@ -0,0 +1,285 @@ +#ifndef _LIBZIM_COMPRESSION_ +#define _LIBZIM_COMPRESSION_ + +#include +#include "string.h" + +#include "file_reader.h" +#include + +#include "config.h" + +#include +#if defined(ENABLE_ZLIB) +#include +#endif + +#if defined(ENABLE_ZSTD) +#include +#endif + + +#include "zim_types.h" + +//#define DEB(X) std::cerr << __func__ << " " << X << std::endl ; +#define DEB(X) + +enum class CompStep { + STEP, + FINISH +}; + +enum class CompStatus { + OK, + STREAM_END, + BUF_ERROR, + OTHER +}; + +enum class RunnerStatus { + OK, + NEED_MORE, + ERROR +}; + +struct LZMA_INFO { + typedef lzma_stream stream_t; + static const std::string name; + static void init_stream_decoder(stream_t* stream, char* raw_data); + static void init_stream_encoder(stream_t* stream, char* raw_data); + static CompStatus stream_run_encode(stream_t* stream, CompStep step); + static CompStatus stream_run_decode(stream_t* stream, CompStep step); + static CompStatus stream_run(stream_t* stream, CompStep step); + static void stream_end_encode(stream_t* stream); + static void stream_end_decode(stream_t* stream); +}; + + +#if defined(ENABLE_ZLIB) +struct ZIP_INFO { + typedef z_stream stream_t; + static const std::string name; + static void init_stream_decoder(stream_t* stream, char* raw_data); + static void init_stream_encoder(stream_t* stream, char* raw_data); + static CompStatus stream_run_encode(stream_t* stream, CompStep step); + static CompStatus stream_run_decode(stream_t* stream, CompStep step); + static void stream_end_encode(stream_t* stream); + static void stream_end_decode(stream_t* stream); +}; +#endif + +#if defined(ENABLE_ZSTD) +struct ZSTD_INFO { + struct stream_t + { + const unsigned char* next_in; + size_t avail_in; + unsigned 
char* next_out; + size_t avail_out; + size_t total_out; + + ::ZSTD_CStream* encoder_stream; + ::ZSTD_DStream* decoder_stream; + + stream_t(); + ~stream_t(); + private: + stream_t(const stream_t& t) = delete; + void operator=(const stream_t& t) = delete; + }; + + static const std::string name; + static void init_stream_decoder(stream_t* stream, char* raw_data); + static void init_stream_encoder(stream_t* stream, char* raw_data); + static CompStatus stream_run_encode(stream_t* stream, CompStep step); + static CompStatus stream_run_decode(stream_t* stream, CompStep step); + static void stream_end_encode(stream_t* stream); + static void stream_end_decode(stream_t* stream); +}; + +#endif + + +namespace zim { + +template +class Uncompressor +{ + public: + Uncompressor(size_t initial_size=1024*1024) : + ret_data(new char[initial_size]), + data_size(initial_size) + {} + ~Uncompressor() = default; + + void init(char* data) { + INFO::init_stream_decoder(&stream, data); + stream.next_out = (uint8_t*)ret_data.get(); + stream.avail_out = data_size; + } + + RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) { + stream.next_in = (unsigned char*)data; + stream.avail_in = size; + auto errcode = CompStatus::OTHER; + while (true) { + errcode = INFO::stream_run_decode(&stream, step); + DEB((int)errcode) + if (errcode == CompStatus::BUF_ERROR) { + if (stream.avail_in == 0 && stream.avail_out != 0) { + // End of input stream. + // compressor hasn't recognize the end of the input stream but there is + // no more input. 
+ return RunnerStatus::NEED_MORE; + } else { + //Not enought output size + DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out) + data_size *= 2; + std::unique_ptr new_ret_data(new char[data_size]); + memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); + stream.avail_out = data_size - stream.total_out; + DEB(data_size << " " << stream.avail_out << " " << stream.avail_in) + ret_data = std::move(new_ret_data); + continue; + } + } + if (errcode == CompStatus::STREAM_END) + break; + // On first call where lzma cannot progress (no output size). + // Lzma return OK. If we return NEED_MORE, then we will try to compress + // with new input data, but we should not as current one is not processed. + // We must do a second step to have te BUF_ERROR and handle thing correctly. + if (errcode == CompStatus::OK) { + if (stream.avail_in == 0) + break; + continue; + } + return RunnerStatus::ERROR; + }; + return errcode==CompStatus::STREAM_END?RunnerStatus::OK:RunnerStatus::NEED_MORE; + } + + std::unique_ptr get_data(zim::zsize_t* size) { + feed(nullptr, 0, CompStep::FINISH); + size->v = stream.total_out; + INFO::stream_end_decode(&stream); + return std::move(ret_data); + } + + private: + std::unique_ptr ret_data; + size_type data_size; + typename INFO::stream_t stream; +}; + +#define CHUNCK_SIZE ((zim::size_type)(1024)) +/** + * Uncompress data of the reader at startOffset. + * + * @param reader The reader where the data is. + * @param startOffset The offset where the data is in the reader. + * @param dest_size[out] The size of the uncompressed data. + * @return A pointer to the uncompressed data. This must be deleted (delete[]) +*/ +template +std::unique_ptr uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) { + // Use a compressor to compress the data. 
+  // Neither the uncompressed size nor the size of the compressed stream is
+  // known in advance, so the input is fed to the decoder chunk by chunk
+  // until the decoder reports the end of the stream.
+  // The initial output size (1 MiB) is assumed to be close to the
+  // minChunkSize used at creation.
+  Uncompressor<INFO> runner(1024*1024);
+  // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
+  // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
+  std::vector<char> raw_data(CHUNCK_SIZE);
+
+  DEB("Init")
+  runner.init(raw_data.data());
+
+  zim::size_type availableSize = reader->size().v - startOffset.v;
+  auto ret = RunnerStatus::NEED_MORE;
+  while(ret != RunnerStatus::OK) {
+    if (ret == RunnerStatus::NEED_MORE) {
+      if (availableSize == 0) {
+        // The decoder still asks for input but the reader has nothing left:
+        // the stream is truncated. Without this check the loop would never
+        // change `ret` again and would spin forever.
+        throw zim::ZimFileFormatError(std::string("Truncated ") + INFO::name
+                                    + std::string(" stream for cluster."));
+      }
+      zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
+      reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize));
+      startOffset.v += inputSize;
+      availableSize -= inputSize;
+      DEB("Step " << startOffset.v)
+      ret = runner.feed(raw_data.data(), inputSize);
+      DEB("Ret " << (int)ret)
+    }
+    if (ret == RunnerStatus::ERROR) {
+      throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name
+                                  + std::string(" stream for cluster."));
+    }
+  }
+
+  DEB("Finish")
+  return runner.get_data(dest_size);
+}
+
+template
+class Compressor
+{
+  public:
+    Compressor(size_t initial_size=1024*1024) :
+      ret_data(new char[initial_size]),
+      ret_size(initial_size)
+    {}
+
+    ~Compressor() = default;
+
+    void init(char* data) {
+      INFO::init_stream_encoder(&stream, data);
+      stream.next_out = (uint8_t*)ret_data.get();
+      stream.avail_out = ret_size;
+    }
+
+    RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) {
+      stream.next_in = (unsigned char*)data;
+      stream.avail_in = size;
+      auto errcode = CompStatus::OTHER;
+      while (true) {
+        errcode = INFO::stream_run_encode(&stream, step);
+        if (stream.avail_out == 0) {
+          if (errcode == CompStatus::OK) {
+            // lzma return a OK return status the first time it runs out of output memory.
+ // The BUF_ERROR is returned only the second time we call a lzma_code. + continue; + } + if (errcode == CompStatus::BUF_ERROR) { + //Not enought output size + ret_size *= 2; + std::unique_ptr new_ret_data(new char[ret_size]); + memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); + stream.avail_out = ret_size - stream.total_out; + ret_data = std::move(new_ret_data); + continue; + } + } + if (errcode == CompStatus::STREAM_END || errcode == CompStatus::OK) { + // Everything ok, quit the loop + break; + } + return RunnerStatus::ERROR; + }; + return RunnerStatus::NEED_MORE; + } + + std::unique_ptr get_data(zim::zsize_t* size) { + feed(nullptr, 0, CompStep::FINISH); + INFO::stream_end_encode(&stream); + size->v = stream.total_out; + return std::move(ret_data); + } + + private: + std::unique_ptr ret_data; + size_t ret_size; + typename INFO::stream_t stream; +}; + +} // namespace zim + +#endif // _LIBZIM_COMPRESSION_ diff --git a/src/config.h.in b/src/config.h.in new file mode 100644 index 0000000..78ab74b --- /dev/null +++ b/src/config.h.in @@ -0,0 +1,20 @@ + +#mesondefine VERSION + +#mesondefine DIRENT_CACHE_SIZE + +#mesondefine CLUSTER_CACHE_SIZE + +#mesondefine LZMA_MEMORY_SIZE + +#mesondefine ENABLE_ZLIB + +#mesondefine ENABLE_ZSTD + +#mesondefine ENABLE_XAPIAN + +#mesondefine ENABLE_USE_MMAP + +#mesondefine ENABLE_USE_BUFFER_HEADER + +#mesondefine MMAP_SUPPORT_64 diff --git a/src/debug.h b/src/debug.h new file mode 100644 index 0000000..a73dd16 --- /dev/null +++ b/src/debug.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +#include +#include +#include + +#if defined (NDEBUG) +# define ASSERT(left, operator, right) (void(0)) +#else + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) +#include +#endif + +template +void _on_assert_fail(const char* vara, const char* op, const char* varb, + T a, U b, const char* file, int line) { + std::ostringstream ss; + ss << "\nAssertion failed at "<< file << ":" << line << "\n " << + vara << "[" << a << "] " << op << " " << varb << "[" << b << "]"; + std::cerr << ss.str() << std::endl; + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) + void *callstack[64]; + size_t size; + size = backtrace(callstack, 64); + char** strings = backtrace_symbols(callstack, size); + for (size_t i=0; i +#include "buffer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include + +log_define("zim.dirent") + +namespace zim +{ + ////////////////////////////////////////////////////////////////////// + // Dirent + // + + const uint16_t Dirent::redirectMimeType; + const uint16_t Dirent::linktargetMimeType; + const uint16_t Dirent::deletedMimeType; + + Dirent::Dirent(std::unique_ptr buffer) + : Dirent() + { + uint16_t mimeType = buffer->as(offset_t(0)); + bool redirect = (mimeType == Dirent::redirectMimeType); + bool linktarget = (mimeType == Dirent::linktargetMimeType); + bool deleted = (mimeType == Dirent::deletedMimeType); + uint8_t extraLen = buffer->data()[2]; + char ns = 
buffer->data()[3]; + uint32_t version = buffer->as(offset_t(4)); + setVersion(version); + + offset_t current = offset_t(8); + + if (redirect) + { + article_index_t redirectIndex(buffer->as(current)); + current += sizeof(article_index_t); + + log_debug("redirectIndex=" << redirectIndex); + + setRedirect(article_index_t(redirectIndex)); + } + else if (linktarget || deleted) + { + log_debug("linktarget or deleted entry"); + setArticle(mimeType, cluster_index_t(0), blob_index_t(0)); + } + else + { + log_debug("read article entry"); + + uint32_t clusterNumber = buffer->as(current); + current += sizeof(uint32_t); + uint32_t blobNumber = buffer->as(current); + current += sizeof(uint32_t); + + log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber); + + setArticle(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber)); + } + + std::string url; + std::string title; + std::string parameter; + + log_debug("read url, title and parameters"); + + offset_type url_size = strnlen( + buffer->data(current), + buffer->size().v - current.v - extraLen + ); + if (current.v + url_size >= buffer->size().v) { + throw(InvalidSize()); + } + url = std::string(buffer->data(current), url_size); + current += url_size + 1; + + offset_type title_size = strnlen( + buffer->data(current), + buffer->size().v - current.v - extraLen + ); + if (current.v + title_size >= buffer->size().v) { + throw(InvalidSize()); + } + title = std::string(buffer->data(current), title_size); + current += title_size + 1; + + if (current.v + extraLen > buffer->size().v) { + throw(InvalidSize()); + } + parameter = std::string(buffer->data(current), extraLen); + + setUrl(ns, url); + setTitle(title); + setParameter(parameter); + } + + std::string Dirent::getLongUrl() const + { + log_trace("Dirent::getLongUrl()"); + log_debug("namespace=" << getNamespace() << " title=" << getTitle()); + + return std::string(1, getNamespace()) + '/' + getUrl(); + } + +} diff --git 
a/src/endian_tools.h b/src/endian_tools.h new file mode 100644 index 0000000..9bf6bf7 --- /dev/null +++ b/src/endian_tools.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ENDIAN_H +#define ENDIAN_H + +#include +#include +#include + +namespace zim +{ + +template +struct ToLittleEndianImpl; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint16_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + } +}; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint32_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + dst[2] = static_cast(v>>16); + dst[3] = static_cast(v>>24); +} +}; + +template +struct ToLittleEndianImpl{ + static void write(const T& d, char* dst) { + uint64_t v = static_cast(d); + dst[0] = static_cast(v); + dst[1] = static_cast(v>>8); + dst[2] = static_cast(v>>16); + dst[3] = static_cast(v>>24); + dst[4] = static_cast(v>>32); + dst[5] = static_cast(v>>40); + dst[6] = static_cast(v>>48); + dst[7] = static_cast(v>>56); + } +}; + +//////////////////////////////////////////////////////////////////////// +template +inline void toLittleEndian(T d, 
char* dst)
+{
+  ToLittleEndianImpl::write(d, dst);
+}
+
+/**
+ * Decode a little-endian integer of type T from the sizeof(T) bytes at ptr.
+ *
+ * Byte i of the input is shifted left by 8*i bits and OR-ed into the result.
+ */
+template<typename T>
+inline T fromLittleEndian(const char* ptr)
+{
+  T ret = 0;
+  for(size_t i=0; i<sizeof(T); i++) {
+    // Go through unsigned char first so a negative char is not sign-extended.
+    ret |= (static_cast<T>(static_cast<unsigned char>(ptr[i])) << (i*8));
+  }
+  return ret;
+}
+
+}
+
+#endif // ENDIAN_H
+
diff --git a/src/envvalue.cpp b/src/envvalue.cpp
new file mode 100644
index 0000000..1d5c64f
--- /dev/null
+++ b/src/envvalue.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdlib.h>
+#include <sstream>
+
+namespace zim
+{
+  /**
+   * Read an unsigned integer from an environment variable.
+   *
+   * @param env Name of the environment variable.
+   * @param def Value returned when the variable is not set or does not
+   *            start with a number that can be extracted as unsigned.
+   * @return The parsed value, or def.
+   */
+  unsigned envValue(const char* env, unsigned def)
+  {
+    const char* v = ::getenv(env);
+    if (v)
+    {
+      // Extract into a scratch variable: a failed extraction overwrites its
+      // target, which would silently discard the caller's default.
+      unsigned parsed = 0;
+      std::istringstream s(v);
+      if (s >> parsed)
+        def = parsed;
+    }
+    return def;
+  }
+
+  /**
+   * Read a memory size from an environment variable.
+   *
+   * The value is an unsigned number optionally followed by a unit suffix:
+   * k/K (x1024), m/M (x1024^2) or g/G (x1024^3). Any other suffix is ignored.
+   *
+   * @param env Name of the environment variable.
+   * @param def Value returned when the variable is not set or does not
+   *            start with a number that can be extracted as unsigned.
+   * @return The size in bytes, clamped to the largest unsigned value when
+   *         the multiplication would overflow, or def.
+   */
+  unsigned envMemSize(const char* env, unsigned def)
+  {
+    const char* v = ::getenv(env);
+    if (v)
+    {
+      unsigned value = 0;
+      std::istringstream s(v);
+      if (s >> value)
+      {
+        // The suffix is optional; unit stays '\0' when nothing follows.
+        char unit = '\0';
+        s >> unit;
+
+        unsigned factor = 1;
+        switch (unit)
+        {
+          case 'k':
+          case 'K': factor = 1024; break;
+          case 'm':
+          case 'M': factor = 1024 * 1024; break;
+          case 'g':
+          case 'G': factor = 1024 * 1024 * 1024; break;
+        }
+
+        // Saturate instead of wrapping around: with a 32-bit unsigned,
+        // "4G" would otherwise become 0.
+        const unsigned maxValue = ~0u;
+        def = (value > maxValue / factor) ? maxValue : value * factor;
+      }
+    }
+    return def;
+  }
+}
+
diff --git a/src/envvalue.h b/src/envvalue.h
new file mode 100644
index 0000000..d6dffd4
--- /dev/null
+++ b/src/envvalue.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software;
you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ENVVALUE_H +#define ZIM_ENVVALUE_H + +namespace zim +{ + unsigned envValue(const char* env, unsigned def); + unsigned envMemSize(const char* env, unsigned def); +} + +#endif // ZIM_ENVVALUE_H diff --git a/src/file.cpp b/src/file.cpp new file mode 100644 index 0000000..db9b3d3 --- /dev/null +++ b/src/file.cpp @@ -0,0 +1,312 @@ +/* + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include "fileimpl.h" +#include +#include +#include "log.h" +#include +#include + +log_define("zim.file") + +namespace zim +{ + namespace + { + int hexval(char ch) + { + if (ch >= '0' && ch <= '9') + return ch - '0'; + if (ch >= 'a' && ch <= 'f') + return ch - 'a' + 10; + if (ch >= 'A' && ch <= 'F') + return ch - 'A' + 10; + return -1; + } + } + + File::File(const std::string& fname) + : impl(new FileImpl(fname)) + { } + + const std::string& File::getFilename() const + { + return impl->getFilename(); + } + + const Fileheader& File::getFileheader() const + { + return impl->getFileheader(); + } + + size_type File::getFilesize() const + { + return impl->getFilesize().v; + } + + article_index_type File::getCountArticles() const + { + return article_index_type(impl->getCountArticles()); + } + + Article File::getArticle(article_index_type idx) const + { + if (idx >= article_index_type(impl->getCountArticles())) + throw ZimFileFormatError("article index out of range"); + return Article(impl, idx); + } + + Article File::getArticle(char ns, const std::string& url) const + { + log_trace("File::getArticle('" << ns << "', \"" << url << ')'); + std::pair r = impl->findx(ns, url); + return r.first ? Article(impl, article_index_type(r.second)) : Article(); + } + + Article File::getArticleByUrl(const std::string& url) const + { + log_trace("File::getArticle(\"" << url << ')'); + std::pair r = impl->findx(url); + return r.first ? 
Article(impl, article_index_type(r.second)) : Article(); + } + + Article File::getArticleByTitle(article_index_type idx) const + { + return Article(impl, article_index_type(impl->getIndexByTitle(article_index_t(idx)))); + } + + Article File::getArticleByTitle(char ns, const std::string& title) const + { + log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); + std::pair r = impl->findxByTitle(ns, title); + return r.first + ? Article(impl, article_index_type(impl->getIndexByTitle(r.second))) + : Article(); + } + + Article File::getArticleByClusterOrder(article_index_type idx) const + { + auto res = impl->findxByClusterOrder(idx); + + if (res.first) + return Article(impl, res.second.v); + else + return Article(); + } + + std::shared_ptr File::getCluster(cluster_index_type idx) const + { + return impl->getCluster(cluster_index_t(idx)); + } + + cluster_index_type File::getCountClusters() const + { + return cluster_index_type(impl->getCountClusters()); + } + + offset_type File::getClusterOffset(cluster_index_type idx) const + { + return offset_type(impl->getClusterOffset(cluster_index_t(idx))); + } + + Blob File::getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const + { + return impl->getCluster(cluster_index_t(clusterIdx))->getBlob(blob_index_t(blobIdx)); + } + + article_index_type File::getNamespaceBeginOffset(char ch) const + { + return article_index_type(impl->getNamespaceBeginOffset(ch)); + } + + article_index_type File::getNamespaceEndOffset(char ch) const + { + return article_index_type(impl->getNamespaceEndOffset(ch)); + } + + article_index_type File::getNamespaceCount(char ns) const + { + return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); + } + + std::string File::getNamespaces() const + { + return impl->getNamespaces(); + } + + bool File::hasNamespace(char ch) const + { + article_index_t off = impl->getNamespaceBeginOffset(ch); + return off < impl->getCountArticles() && impl->getDirent(off)->getNamespace() == ch; 
+ } + + File::const_iterator File::begin() const + { return const_iterator(this, 0, const_iterator::ClusterIterator); } + + File::const_iterator File::beginByTitle() const + { return const_iterator(this, 0, const_iterator::ArticleIterator); } + + File::const_iterator File::beginByUrl() const + { return const_iterator(this, 0, const_iterator::UrlIterator); } + + File::const_iterator File::end() const + { return const_iterator(this, getCountArticles(), const_iterator::UrlIterator); } + + File::const_iterator File::find(char ns, const std::string& url) const + { + std::pair r = impl->findx(ns, url); + return File::const_iterator(this, article_index_type(r.second), const_iterator::UrlIterator); + } + + File::const_iterator File::find(const std::string& url) const + { + std::pair r = impl->findx(url); + return File::const_iterator(this, article_index_type(r.second), const_iterator::UrlIterator); + } + + File::const_iterator File::findByTitle(char ns, const std::string& title) const + { + std::pair r = impl->findxByTitle(ns, title); + return File::const_iterator(this, article_index_type(r.second), const_iterator::ArticleIterator); + } + + std::unique_ptr File::search(const std::string& query, int start, int end) const { + auto search = std::unique_ptr(new Search(this)); + search->set_query(query); + search->set_range(start, end); + return search; + } + + std::unique_ptr File::suggestions(const std::string& query, int start, int end) const { + auto search = std::unique_ptr(new Search(this)); + search->set_query(query); + search->set_range(start, end); + search->set_suggestion_mode(true); + return search; + } + + offset_type File::getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const + { + return offset_type(impl->getBlobOffset( + cluster_index_t(clusterIdx), + blob_index_t(blobIdx))); + } + + time_t File::getMTime() const + { + return impl->getMTime(); + } + + const std::string& File::getMimeType(uint16_t idx) const + { + return impl->getMimeType(idx); + 
} + + std::string File::getChecksum() + { + return impl->getChecksum(); + } + + bool File::verify() + { + return impl->verify(); + } + + bool File::is_multiPart() const + { + return impl->is_multiPart(); + } + + + std::string urldecode(const std::string& url) + { + std::string ret; + enum { + state_0, + state_h1, + state_h2 + } state = state_0; + + char ch = '\0'; + for (std::string::const_iterator it = url.begin(); it != url.end(); ++it) + { + switch (state) + { + case state_0: + if (*it == '+') + ret += ' '; + else if (*it == '%') + state = state_h1; + else + ret += *it; + break; + + case state_h1: + if ( (*it >= '0' && *it <= '9') + || (*it >= 'A' && *it <= 'F') + || (*it >= 'a' && *it <= 'f')) + { + ch = *it; + state = state_h2; + } + else + { + ret += '%'; + ret += *it; + state = state_0; + } + break; + + case state_h2: + if ( (*it >= '0' && *it <= '9') + || (*it >= 'A' && *it <= 'F') + || (*it >= 'a' && *it <= 'f')) + { + ret += static_cast(hexval(ch) * 16 + hexval(*it)); + } + else + { + ret += static_cast(hexval(ch)); + ret += *it; + } + state = state_0; + break; + } + } + + switch (state) + { + case state_0: + break; + + case state_h1: + ret += '%'; + break; + + case state_h2: + ret += '%'; + ret += ch; + break; + } + + return ret; + } +} diff --git a/src/file_compound.cpp b/src/file_compound.cpp new file mode 100644 index 0000000..6d52639 --- /dev/null +++ b/src/file_compound.cpp @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "file_compound.h"
+#include "buffer.h"
+
+// errno / strerror / stat / ostringstream / runtime_error / unique_ptr
+// are all used below.
+#include <errno.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace zim {
+
+/**
+ * Opens `filename` as a single part. If that fails for any reason, falls
+ * back to a split archive: `filename` + "aa", "ab", ... "az", "ba", ...
+ * Each part is mapped to the byte range it occupies in the logical file and
+ * `_fsize` accumulates the total size.
+ *
+ * @throws std::runtime_error when neither the single file nor any split part
+ *         could be opened (the message carries the errno of the first try).
+ */
+FileCompound::FileCompound(const std::string& filename):
+  _fsize(0),
+  mtime(0)   // getMTime() treats 0 as "not cached yet"; must not be left indeterminate
+{
+  // Opens `path` and appends it right after the parts registered so far.
+  // The part stays in a unique_ptr until the map has really taken it, so it
+  // is freed if emplace throws or refuses the key.
+  auto appendPart = [this](const std::string& path) {
+    std::unique_ptr<FilePart<>> part(new FilePart<>(path));
+    const zsize_t partSize = part->size();
+    const Range range(offset_t(_fsize.v), offset_t((_fsize + partSize).v));
+    if (emplace(range, part.get()).second) {
+      part.release();   // ownership moved to the map; ~FileCompound deletes it
+    }
+    _fsize += partSize;
+  };
+
+  try {
+    appendPart(filename);
+  } catch(...) {
+    // Remember why the plain file could not be opened before trying parts.
+    int errnoSave = errno;
+    _fsize = zsize_t(0);
+    for (char ch0 = 'a'; ch0 <= 'z'; ++ch0)
+    {
+      const std::string fname0 = filename + ch0;
+      for (char ch1 = 'a'; ch1 <= 'z'; ++ch1)
+      {
+        try {
+          appendPart(fname0 + ch1);
+        } catch (...) {
+          // First missing suffix ends this ch0 series (the outer loop still
+          // moves on to the next first letter).
+          break;
+        }
+      }
+    }
+
+    if (empty())
+    {
+      std::ostringstream msg;
+      msg << "error " << errnoSave << " opening file \"" << filename;
+      throw std::runtime_error(msg.str());
+    }
+  }
+}
+
+/**
+ * Wraps one already-opened part. The compound takes ownership of `filePart`
+ * (it is deleted in the destructor).
+ */
+FileCompound::FileCompound(FilePart<>* filePart):
+  _fsize(0),
+  mtime(0)
+{
+  emplace(Range(offset_t(0), offset_t(filePart->size().v)), filePart);
+  _fsize = filePart->size();
+}
+
+/** Deletes every part stored in the map. */
+FileCompound::~FileCompound() {
+  for (auto& entry : *this) {
+    delete entry.second;
+  }
+}
+
+/**
+ * Modification time of the first part, cached after the first successful
+ * stat. Returns the cached value (0 when nothing is cached) if the compound
+ * has no part.
+ *
+ * @throws std::runtime_error when stat() on the first part's file name fails.
+ */
+time_t FileCompound::getMTime() const {
+  if (mtime || empty())
+    return mtime;
+
+  const char* fname = begin()->second->filename().c_str();
+
+  #if defined(HAVE_STAT64) && ! defined(__APPLE__)
+    struct stat64 st;
+    int ret = ::stat64(fname, &st);
+  #else
+    struct stat st;
+    int ret = ::stat(fname, &st);
+  #endif
+  if (ret != 0)
+  {
+    // Read errno once: the stream insertions below are allowed to change it.
+    const int statErrno = errno;
+    std::ostringstream msg;
+    msg << "stat failed with errno " << statErrno << " : " << strerror(statErrno);
+    throw std::runtime_error(msg.str());
+  }
+  mtime = st.st_mtime;
+
+  return mtime;
+
+}
+
+} // zim
diff --git a/src/file_compound.h b/src/file_compound.h
new file mode 100644
index 0000000..a6b7490
--- /dev/null
+++ b/src/file_compound.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_COMPOUND_H_
+#define ZIM_FILE_COMPOUND_H_
+
+#include "file_part.h"
+#include "zim_types.h"
+#include <cstdio>
+#include <ctime>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+
+namespace zim {
+
+class FileReader;
+
+/**
+ * Byte range inside the logical (concatenated) file.
+ * `less_range` treats `max` as exclusive: a range ending at X sorts strictly
+ * before a range starting at X.
+ */
+struct Range {
+  // Deliberately implicit: lookups such as lower_bound(offset) pass a bare
+  // offset_t and rely on this conversion to a one-point range.
+  Range(const offset_t point ) : min(point), max(point) {}
+  Range(const offset_t min, const offset_t max) : min(min), max(max) {}
+  const offset_t min;
+  const offset_t max;
+};
+
+/**
+ * Strict ordering used as the map comparator: `lhs` is less than `rhs` only
+ * when it lies entirely before it. Ranges that overlap are therefore
+ * equivalent keys, which is what lets a point or a span find the part(s)
+ * covering it.
+ *
+ * No std::binary_function base: it is deprecated since C++11 and removed in
+ * C++17, and std::map does not need it.
+ */
+struct less_range
+{
+  bool operator()(const Range& lhs, const Range& rhs) const {
+    return lhs.min < rhs.min && lhs.max <= rhs.min;
+  }
+};
+
+/**
+ * A logical file made of one or several parts, keyed by the byte range each
+ * part covers. The stored FilePart pointers are owned by the compound and
+ * deleted in its destructor.
+ *
+ * NOTE(review): copying is not disabled although the destructor deletes the
+ * parts; a copied instance would delete them twice. Confirm no caller copies.
+ */
+class FileCompound : public std::map<Range, FilePart<>*, less_range> {
+  public:
+    FileCompound(const std::string& filename);
+    FileCompound(FilePart<>* fpart);
+    ~FileCompound();
+
+    /// Total size of all parts.
+    zsize_t fsize() const { return _fsize; };
+    time_t getMTime() const;
+    /// True when no part could be registered.
+    bool fail() const { return empty(); };
+    bool is_multiPart() const { return size() > 1; };
+
+    /// Iterator pair over the parts overlapping [offset, offset+size).
+    std::pair<const_iterator, const_iterator>
+    locate(offset_t offset, zsize_t size) const {
+        return equal_range(Range(offset, offset+size));
+    }
+
+  private:
+    zsize_t _fsize;
+    // Cache for getMTime(); 0 means "not fetched yet". Initialised here so
+    // every constructor starts from a defined value.
+    mutable time_t mtime = 0;
+};
+
+
+} // namespace zim
+
+
+#endif //ZIM_FILE_COMPOUND_H_
diff --git a/src/file_part.h b/src/file_part.h
new file mode 100644
index 0000000..3867d29
--- /dev/null
+++ b/src/file_part.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_PART_H_
+#define ZIM_FILE_PART_H_
+
+// NOTE(review): the targets of the three #include lines below and the
+// parameter list of the `template` line are missing in this copy of the
+// patch; restore them from the upstream file before building.
+#include
+#include
+
+#include
+
+#include "zim_types.h"
+#include "fs.h"
+
+namespace zim {
+
+/**
+ * One physical file of a ZIM archive: its name, an open handle and its size.
+ *
+ * `FS` supplies the handle type (`FS::FD`) and `FS::openFile()`. Callers use
+ * `FilePart<>`, so the lost parameter list presumably gave `FS` a default
+ * (TODO confirm).
+ */
+template
+class FilePart {
+  public:
+    /// Opens `filename` through FS::openFile() and records the handle's size.
+    FilePart(const std::string& filename) :
+        m_filename(filename),
+        m_fhandle(FS::openFile(filename)),
+        m_size(m_fhandle.getSize()) {}
+    /// Wraps an existing descriptor; the part then has an empty file name.
+    FilePart(int fd) :
+        m_filename(""),
+        m_fhandle(fd),
+        m_size(m_fhandle.getSize()) {}
+    ~FilePart() = default;
+    const std::string& filename() const { return m_filename; };
+    const typename FS::FD& fhandle() const { return m_fhandle; };
+
+    /// Size captured at construction time (not re-read afterwards).
+    zsize_t size() const { return m_size; };
+    /// fail()/good() only test whether the recorded size is zero / non-zero.
+    bool fail() const { return !m_size; };
+    bool good() const { return bool(m_size); };
+
+  private:
+    // Declaration order matters: m_size is initialised from m_fhandle, so
+    // m_fhandle must stay declared before it.
+    const std::string m_filename;
+    typename FS::FD m_fhandle;
+    zsize_t m_size;
+};
+
+};
+
+#endif //ZIM_FILE_PART_H_
diff --git a/src/file_reader.cpp b/src/file_reader.cpp
new file mode 100644
index 0000000..64eb5d5
--- /dev/null
+++ b/src/file_reader.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "file_reader.h" +#include "file_compound.h" +#include "cluster.h" +#include "buffer.h" +#include "compression.h" +#include +#include +#include +#include +#include +#include +#include +#include + + +#if defined(_MSC_VER) +# include +# include + typedef SSIZE_T ssize_t; +#endif + +namespace zim { + +FileReader::FileReader(std::shared_ptr source) + : FileReader(source, offset_t(0), source->fsize()) {} + +FileReader::FileReader(std::shared_ptr source, offset_t offset) + : FileReader(source, offset, zsize_t(source->fsize().v-offset.v)) {} + +FileReader::FileReader(std::shared_ptr source, offset_t offset, zsize_t size) + : source(source), + _offset(offset), + _size(size) +{ + ASSERT(offset.v, <=, source->fsize().v); + ASSERT(offset.v+size.v, <=, source->fsize().v); +} + +char FileReader::read(offset_t offset) const { + ASSERT(offset.v, <, _size.v); + offset += _offset; + auto part_pair = source->lower_bound(offset); + auto& fhandle = part_pair->second->fhandle(); + offset_t local_offset = offset - part_pair->first.min; + ASSERT(local_offset, <=, part_pair->first.max); + char ret; + try { + fhandle.readAt(&ret, zsize_t(1), local_offset); + } catch (std::runtime_error& e) { + //Error while reading. 
+ std::ostringstream s; + s << "Cannot read a char.\n"; + s << " - File part is " << part_pair->second->filename() << "\n"; + s << " - File part size is " << part_pair->second->size().v << "\n"; + s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + return ret; +} + + +void FileReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <, _size.v); + ASSERT(offset.v+size.v, <=, _size.v); + if (! size ) { + return; + } + offset += _offset; + auto found_range = source->locate(offset, size); + for(auto current = found_range.first; current!=found_range.second; current++){ + auto part = current->second; + Range partRange = current->first; + offset_t local_offset = offset-partRange.min; + ASSERT(size.v, >, 0U); + zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v)); + try { + part->fhandle().readAt(dest, size_to_get, local_offset); + } catch (std::runtime_error& e) { + std::ostringstream s; + s << "Cannot read chars.\n"; + s << " - File part is " << part->filename() << "\n"; + s << " - File part size is " << part->size().v << "\n"; + s << " - File part range is " << partRange.min << "-" << partRange.max << "\n"; + s << " - size_to_get is " << size_to_get.v << "\n"; + s << " - total size is " << size.v << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + ASSERT(size_to_get, <=, size); + dest += size_to_get.v; + size -= size_to_get; + offset += size_to_get; + } + ASSERT(size.v, ==, 0U); +} + + +std::shared_ptr 
FileReader::get_buffer(offset_t offset, zsize_t size) const { + ASSERT(size, <=, _size); +#ifdef ENABLE_USE_MMAP + try { + auto found_range = source->locate(_offset+offset, size); + auto first_part_containing_it = found_range.first; + if (++first_part_containing_it != found_range.second) { + throw MMapException(); + } + + // The range is in only one part + auto range = found_range.first->first; + auto part = found_range.first->second; + auto local_offset = offset + _offset - range.min; + ASSERT(size, <=, part->size()); + int fd = part->fhandle().getNativeHandle(); + auto buffer = std::shared_ptr(new MMapBuffer(fd, local_offset, size)); + return buffer; + } catch(MMapException& e) +#endif + { + // The range is several part, or we are on Windows. + // We will have to do some memory copies :/ + // [TODO] Use Windows equivalent for mmap. + char* p = new char[size.v]; + auto ret_buffer = std::shared_ptr(new MemoryBuffer(p, size)); + read(p, offset, size); + return ret_buffer; + } +} + +bool Reader::can_read(offset_t offset, zsize_t size) +{ + return (offset.v <= this->size().v && (offset.v+size.v) <= this->size().v); +} + + +std::shared_ptr Reader::get_clusterBuffer(offset_t offset, CompressionType comp) const +{ + zsize_t uncompressed_size(0); + std::unique_ptr uncompressed_data; + switch (comp) { + case zimcompLzma: + uncompressed_data = uncompress(this, offset, &uncompressed_size); + break; + case zimcompZip: +#if defined(ENABLE_ZLIB) + uncompressed_data = uncompress(this, offset, &uncompressed_size); +#else + throw std::runtime_error("zlib not enabled in this library"); +#endif + break; + case zimcompZstd: +#if defined(ENABLE_ZSTD) + uncompressed_data = uncompress(this, offset, &uncompressed_size); +#else + throw std::runtime_error("zstd not enabled in this library"); +#endif + break; + default: + throw std::logic_error("compressions should not be something else than zimcompLzma, zimComZip or zimcompZstd."); + } + return std::shared_ptr(new 
MemoryBuffer(uncompressed_data.release(), uncompressed_size)); +} + +std::unique_ptr Reader::sub_clusterReader(offset_t offset, CompressionType* comp, bool* extended) const { + uint8_t clusterInfo = read(offset); + *comp = static_cast(clusterInfo & 0x0F); + *extended = clusterInfo & 0x10; + + switch (*comp) { + case zimcompDefault: + case zimcompNone: + { + auto size = Cluster::read_size(this, *extended, offset + offset_t(1)); + // No compression, just a sub_reader + return sub_reader(offset+offset_t(1), size); + } + break; + case zimcompLzma: + case zimcompZip: + case zimcompZstd: + { + auto buffer = get_clusterBuffer(offset+offset_t(1), *comp); + return std::unique_ptr(new BufferReader(buffer)); + } + break; + case zimcompBzip2: + throw std::runtime_error("bzip2 not enabled in this library"); + default: + throw ZimFileFormatError("Invalid compression flag"); + } +} + +std::unique_ptr FileReader::sub_reader(offset_t offset, zsize_t size) const +{ + ASSERT(size, <=, _size); + return std::unique_ptr(new FileReader(source, _offset+offset, size)); +} + + +//BufferReader::BufferReader(std::shared_ptr source) +// : source(source) {} + +std::shared_ptr BufferReader::get_buffer(offset_t offset, zsize_t size) const +{ + return source->sub_buffer(offset, size); +} + +std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const +{ + //auto source_addr = source->data(0); + auto sub_buff = get_buffer(offset, size); + //auto buff_addr = sub_buff->data(0); + std::unique_ptr sub_read(new BufferReader(sub_buff)); + return sub_read; +} + +zsize_t BufferReader::size() const +{ + return source->size(); +} + +offset_t BufferReader::offset() const +{ + return offset_t((offset_type)(static_cast(source->data(offset_t(0))))); +} + + +void BufferReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <, source->size().v); + ASSERT(offset+offset_t(size.v), <=, offset_t(source->size().v)); + if (! 
size ) { + return; + } + memcpy(dest, source->data(offset), size.v); +} + + +char BufferReader::read(offset_t offset) const { + ASSERT(offset.v, <, source->size().v); + char dest; + dest = *source->data(offset); + return dest; +} + + +} // zim diff --git a/src/file_reader.h b/src/file_reader.h new file mode 100644 index 0000000..18b2f13 --- /dev/null +++ b/src/file_reader.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_READER_H_ +#define ZIM_FILE_READER_H_ + +#include + +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" + +namespace zim { + +class Buffer; +class FileCompound; + +class Reader { + public: + Reader() {}; + virtual zsize_t size() const = 0; + virtual ~Reader() {}; + + virtual void read(char* dest, offset_t offset, zsize_t size) const = 0; + template + T read_uint(offset_t offset) const { + ASSERT(offset.v, <, size().v); + ASSERT(offset.v+sizeof(T), <=, size().v); + char tmp_buf[sizeof(T)]; + read(tmp_buf, offset, zsize_t(sizeof(T))); + return fromLittleEndian(tmp_buf); + } + virtual char read(offset_t offset) const = 0; + + virtual std::shared_ptr get_buffer(offset_t offset, zsize_t size) const = 0; + std::shared_ptr get_buffer(offset_t 
offset) const { + return get_buffer(offset, zsize_t(size().v-offset.v)); + } + virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; + std::unique_ptr sub_reader(offset_t offset) const { + return sub_reader(offset, zsize_t(size().v-offset.v)); + } + virtual offset_t offset() const = 0; + + std::unique_ptr sub_clusterReader(offset_t offset, + CompressionType* comp, + bool* extented) const; + + bool can_read(offset_t offset, zsize_t size); + + private: + std::shared_ptr get_clusterBuffer(offset_t offset, CompressionType comp) const; +}; + +class FileReader : public Reader { + public: + FileReader(std::shared_ptr source); + ~FileReader() {}; + + zsize_t size() const { return _size; }; + offset_t offset() const { return _offset; }; + + char read(offset_t offset) const; + void read(char* dest, offset_t offset, zsize_t size) const; + std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; + + std::unique_ptr sub_reader(offset_t offest, zsize_t size) const; + + private: + FileReader(std::shared_ptr source, offset_t offset); + FileReader(std::shared_ptr source, offset_t offset, zsize_t size); + + std::shared_ptr source; + offset_t _offset; + zsize_t _size; +}; + +class BufferReader : public Reader { + public: + BufferReader(std::shared_ptr source) + : source(source) {} + virtual ~BufferReader() {}; + + zsize_t size() const; + offset_t offset() const; + + void read(char* dest, offset_t offset, zsize_t size) const; + char read(offset_t offset) const; + std::shared_ptr get_buffer(offset_t offset, zsize_t size) const; + std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; + + private: + std::shared_ptr source; +}; + +}; + +#endif // ZIM_FILE_READER_H_ diff --git a/src/fileheader.cpp b/src/fileheader.cpp new file mode 100644 index 0000000..0e48990 --- /dev/null +++ b/src/fileheader.cpp @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under 
the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+// NOTE(review): the targets of the four #include lines below, the argument of
+// `std::shared_ptr` in read() and the type argument of every `buffer->as(...)`
+// call are missing in this copy of the patch; restore them from upstream.
+#include
+#include
+#include
+#include
+#include "log.h"
+#include "endian_tools.h"
+#include "buffer.h"
+// On non-Windows builds `_write` is defined here as a macro that expands to an
+// `if` statement throwing std::runtime_error when ::write() does not report
+// exactly `size` bytes. On Windows no macro is defined, so `_write` is
+// whatever "io.h" provides and its result is not checked in write() below.
+// NOTE(review): the macro is a bare `if` block, so it is unsafe before an
+// `else`.
+#ifdef _WIN32
+# include "io.h"
+#else
+# include "unistd.h"
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+log_define("zim.file.header")
+
+namespace zim
+{
+  const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d"
+  const uint16_t Fileheader::zimClassicMajorVersion = 5;
+  const uint16_t Fileheader::zimExtendedMajorVersion = 6;
+  const uint16_t Fileheader::zimMinorVersion = 0;
+  const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset)
+
+  /*
+   * Serialises the header into an 80-byte little-endian record and writes it
+   * to `out_fd`. Byte offsets used below:
+   *    0 magic          4 major version    6 minor version    8 uuid
+   *   24 article count 28 cluster count   32 urlPtrPos       40 titleIdxPos
+   *   48 clusterPtrPos 56 mimeListPos     64 main page       68 layout page
+   *   72 checksumPos
+   * Throws std::runtime_error on a short write (non-Windows macro above).
+   */
+  void Fileheader::write(int out_fd) const
+  {
+    char header[Fileheader::size];
+    toLittleEndian(Fileheader::zimMagic, header);
+    toLittleEndian(getMajorVersion(), header + 4);
+    toLittleEndian(getMinorVersion(), header + 6);
+    // Copies sizeof(Uuid) bytes; the next field starts at 24, so this
+    // presumably is 16 bytes (TODO confirm against the Uuid declaration).
+    std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8);
+    toLittleEndian(getArticleCount(), header + 24);
+    toLittleEndian(getClusterCount(), header + 28);
+    toLittleEndian(getUrlPtrPos(), header + 32);
+    toLittleEndian(getTitleIdxPos(), header + 40);
+    toLittleEndian(getClusterPtrPos(), header + 48);
+    toLittleEndian(getMimeListPos(), header + 56);
+    toLittleEndian(getMainPage(), header + 64);
+    toLittleEndian(getLayoutPage(), header + 68);
+    toLittleEndian(getChecksumPos(), header + 72);
+
+    _write(out_fd, header, Fileheader::size);
+  }
+
+  /*
+   * Parses the header from `buffer`, reading the same offsets write() uses,
+   * then runs sanity_check().
+   * Throws ZimFileFormatError when the magic number differs from zimMagic,
+   * when the major version is neither the classic (5) nor the extended (6)
+   * one, or when sanity_check() rejects the values.
+   */
+  void Fileheader::read(std::shared_ptr buffer)
+  {
+    uint32_t magicNumber = buffer->as(offset_t(0));
+    if (magicNumber != Fileheader::zimMagic)
+    {
+      log_error("invalid magic number " << magicNumber << " found - "
+          << Fileheader::zimMagic << " expected");
+      throw ZimFileFormatError("Invalid magic number");
+    }
+
+    uint16_t major_version = buffer->as(offset_t(4));
+    if (major_version != zimClassicMajorVersion && major_version != zimExtendedMajorVersion)
+    {
+      // NOTE(review): Fileheader::zimMajorVersion is not among the constants
+      // defined in this file - confirm it exists or that log_error discards
+      // its argument.
+      log_error("invalid zimfile major version " << major_version << " found - "
+          << Fileheader::zimMajorVersion << " expected");
+      throw ZimFileFormatError("Invalid version");
+    }
+    setMajorVersion(major_version);
+
+    setMinorVersion(buffer->as(offset_t(6)));
+
+    Uuid uuid;
+    // Bytes [8, 24) of the header hold the uuid.
+    std::copy(buffer->data(offset_t(8)), buffer->data(offset_t(24)), uuid.data);
+    setUuid(uuid);
+
+    setArticleCount(buffer->as(offset_t(24)));
+    setClusterCount(buffer->as(offset_t(28)));
+    setUrlPtrPos(buffer->as(offset_t(32)));
+    setTitleIdxPos(buffer->as(offset_t(40)));
+    setClusterPtrPos(buffer->as(offset_t(48)));
+    setMimeListPos(buffer->as(offset_t(56)));
+    setMainPage(buffer->as(offset_t(64)));
+    setLayoutPage(buffer->as(offset_t(68)));
+    setChecksumPos(buffer->as(offset_t(72)));
+
+    sanity_check();
+  }
+
+  /*
+   * Cross-checks the header fields and throws ZimFileFormatError on the first
+   * inconsistency found.
+   */
+  void Fileheader::sanity_check() const {
+    // Either both counts are zero or both are non-zero.
+    if (!!articleCount != !!clusterCount) {
+      throw ZimFileFormatError("No article <=> No cluster");
+    }
+
+    // Both 80 (Fileheader::size) and 72 are accepted here.
+    // NOTE(review): the message only mentions 80.
+    if (mimeListPos != size && mimeListPos != 72) {
+      throw ZimFileFormatError("mimelistPos must be 80.");
+    }
+
+    // NOTE(review): the three checks below reject only values strictly below
+    // mimeListPos, so a position equal to it passes although the messages say
+    // ">".
+    if (urlPtrPos < mimeListPos) {
+      throw ZimFileFormatError("urlPtrPos must be > mimelistPos.");
+    }
+    if (titleIdxPos < mimeListPos) {
+      throw ZimFileFormatError("titleIdxPos must be > mimelistPos.");
+    }
+    if (clusterPtrPos < mimeListPos) {
+      throw ZimFileFormatError("clusterPtrPos must be > mimelistPos.");
+    }
+
+    if (clusterCount > articleCount) {
+      throw ZimFileFormatError("Cluster count cannot be higher than article count.");
+    }
+
+    // A checksumPos of 0 is let through without a position check.
+    if (checksumPos != 0 && checksumPos < mimeListPos) {
+      throw ZimFileFormatError("checksumPos must be > mimeListPos.");
+    }
+  }
+
+}
diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp
new file mode 100644
index 0000000..bfa622f
--- /dev/null
+++ b/src/fileimpl.cpp
@@ -0,0 +1,617 @@
+/*
+ * Copyright (C) 2006,2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fileimpl.h" +#include +#include "_dirent.h" +#include "file_compound.h" +#include "file_reader.h" +#include +#include +#include +#include +#include +#include +#include +#include "config.h" +#include "log.h" +#include "envvalue.h" +#include "md5.h" + +log_define("zim.file.impl") + +namespace zim +{ + ////////////////////////////////////////////////////////////////////// + // FileImpl + // + FileImpl::FileImpl(const std::string& fname) + : zimFile(new FileCompound(fname)), + zimReader(new FileReader(zimFile)), + bufferDirentZone(256), + bufferDirentLock(PTHREAD_MUTEX_INITIALIZER), + filename(fname), + direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), + direntCacheLock(PTHREAD_MUTEX_INITIALIZER), + clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), + clusterCacheLock(PTHREAD_MUTEX_INITIALIZER), + cacheUncompressedCluster(envValue("ZIM_CACHEUNCOMPRESSEDCLUSTER", false)), + namespaceBeginLock(PTHREAD_MUTEX_INITIALIZER), + namespaceEndLock(PTHREAD_MUTEX_INITIALIZER) + { + log_trace("read file \"" << fname << '"'); + + if (zimFile->fail()) + throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"'); + + filename = fname; + + // read header + if (size_type(zimReader->size()) < Fileheader::size) { + throw ZimFileFormatError("zim-file is too small to contain a header"); + } + try { + header.read(zimReader->get_buffer(offset_t(0), zsize_t(Fileheader::size))); + } catch (ZimFileFormatError& e) { + throw e; + } catch (...) 
{ + throw ZimFileFormatError("error reading zim-file header."); + } + + // urlPtrOffsetReader + zsize_t size(header.getArticleCount() * 8); + if (!zimReader->can_read(offset_t(header.getUrlPtrPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + urlPtrOffsetReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getUrlPtrPos()), size))); +#else + urlPtrOffsetReader = zimReader->sub_reader(offset_t(header.getUrlPtrPos()), size); +#endif + + // Create titleIndexBuffer + size = zsize_t(header.getArticleCount() * 4); + if (!zimReader->can_read(offset_t(header.getTitleIdxPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + titleIndexReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getTitleIdxPos()), size))); +#else + titleIndexReader = zimReader->sub_reader(offset_t(header.getTitleIdxPos()), size); +#endif + + // clusterOffsetBuffer + size = zsize_t(header.getClusterCount() * 8); + if (!zimReader->can_read(offset_t(header.getClusterPtrPos()), size)) { + throw ZimFileFormatError("Reading out of zim file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + clusterOffsetReader = std::unique_ptr(new BufferReader( + zimReader->get_buffer(offset_t(header.getClusterPtrPos()), size))); +#else + clusterOffsetReader = zimReader->sub_reader(offset_t(header.getClusterPtrPos()), size); +#endif + + if (!getCountClusters()) + log_warn("no clusters found"); + else + { + offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1)); + log_debug("last offset=" << lastOffset.v << " file size=" << zimFile->fsize().v); + if (lastOffset.v > zimFile->fsize().v) + { + log_fatal("last offset (" << lastOffset << ") larger than file size (" << zimFile->fsize() << ')'); + throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); + } + } + + if (header.hasChecksum() && 
header.getChecksumPos() != (zimFile->fsize().v-16) ) { + throw ZimFileFormatError("Checksum position is not valid"); + } + + // read mime types + // libzim write zims files two ways : + // - The old way by putting the urlPtrPos just after the mimetype. + // - The new way by putting the urlPtrPos at the end of the zim files. + // In this case, the cluster data are always at 1024 bytes offset and we know that + // mimetype list is before this. + // 1024 seems to be a good maximum size for the mimetype list, even for the "old" way. + auto endMimeList = std::min(header.getUrlPtrPos(), static_cast(1024)); + size = zsize_t(endMimeList - header.getMimeListPos()); + auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size); + offset_t current = offset_t(0); + while (current.v < size.v) + { + offset_type len = strlen(buffer->data(current)); + + if (len == 0) { + break; + } + + if (current.v + len >= size.v) { + throw(ZimFileFormatError("Error getting mimelists.")); + } + + std::string mimeType(buffer->data(current), len); + mimeTypes.push_back(mimeType); + + current += (len + 1); + } + } + + + std::pair FileImpl::findx(char ns, const std::string& url) + { + log_debug("find article by url " << ns << " \"" << url << "\", in file \"" << getFilename() << '"'); + + article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); + article_index_type u = article_index_type(getNamespaceEndOffset(ns)); + + if (l == u) + { + log_debug("namespace " << ns << " not found"); + return std::pair(false, article_index_t(0)); + } + + unsigned itcount = 0; + while (u - l > 1) + { + ++itcount; + article_index_type p = l + (u - l) / 2; + auto d = getDirent(article_index_t(p)); + + int c = ns < d->getNamespace() ? -1 + : ns > d->getNamespace() ? 
1 + : url.compare(d->getUrl()); + + if (c < 0) + u = p; + else if (c > 0) + l = p; + else + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); + return std::pair(true, article_index_t(p)); + } + } + + auto d = getDirent(article_index_t(l)); + int c = url.compare(d->getUrl()); + + if (c == 0) + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); + return std::pair(true, article_index_t(l)); + } + + log_debug("article not found after " << itcount << " iterations (\"" << d.getUrl() << "\" does not match)"); + return std::pair(false, article_index_t(c < 0 ? l : u)); + } + + std::pair FileImpl::findx(const std::string& url) + { + size_t start = 0; + if (url[0] == '/') { + start = 1; + } + if (url.size() < (2+start) || url[1+start] != '/') + return std::pair(false, article_index_t(0)); + return findx(url[start], url.substr(2+start)); + } + + std::pair FileImpl::findxByTitle(char ns, const std::string& title) + { + log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"'); + + article_index_type l = article_index_type(getNamespaceBeginOffset(ns)); + article_index_type u = article_index_type(getNamespaceEndOffset(ns)); + + if (l == u) + { + log_debug("namespace " << ns << " not found"); + return std::pair(false, article_index_t(0)); + } + + unsigned itcount = 0; + while (u - l > 1) + { + ++itcount; + article_index_type p = l + (u - l) / 2; + auto d = getDirentByTitle(article_index_t(p)); + + int c = ns < d->getNamespace() ? -1 + : ns > d->getNamespace() ? 
1 + : title.compare(d->getTitle()); + + if (c < 0) + u = p; + else if (c > 0) + l = p; + else + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p); + return std::pair(true, article_index_t(p)); + } + } + + auto d = getDirentByTitle(article_index_t(l)); + int c = title.compare(d->getTitle()); + + if (c == 0) + { + log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l); + return std::pair(true, article_index_t(l)); + } + + log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)"); + return std::pair(false, article_index_t(c < 0 ? l : u)); + } + + std::pair FileImpl::findxByClusterOrder(article_index_type idx) + { + std::call_once(orderOnceFlag, [this] + { + auto nb_articles = this->getCountArticles().v; + articleListByCluster.reserve(nb_articles); + + for(zim::article_index_type i = 0; i < nb_articles; i++) + { + // This is the offset of the dirent in the zimFile + auto indexOffset = getOffset(urlPtrOffsetReader.get(), i); + // Get the mimeType of the dirent (offset 0) to know the type of the dirent + uint16_t mimeType = zimReader->read_uint(indexOffset); + if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) { + articleListByCluster.push_back(std::make_pair(0, i)); + } else { + // If it is a classic article, get the clusterNumber (at offset 8) + auto clusterNumber = zimReader->read_uint(indexOffset+offset_t(8)); + articleListByCluster.push_back(std::make_pair(clusterNumber, i)); + } + } + std::sort(articleListByCluster.begin(), articleListByCluster.end()); + }); + + if (idx >= articleListByCluster.size()) + return std::pair(false, article_index_t(0)); + return std::pair(true, article_index_t(articleListByCluster[idx].second)); + } + + std::pair + FileImpl::getFileParts(offset_t offset, zsize_t size) + { + return 
zimFile->locate(offset, size); + } + + std::shared_ptr FileImpl::getDirent(article_index_t idx) + { + log_trace("FileImpl::getDirent(" << idx << ')'); + + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + + pthread_mutex_lock(&direntCacheLock); + auto v = direntCache.getx(idx); + if (v.first) + { + log_debug("dirent " << idx << " found in cache; hits " + << direntCache.getHits() << " misses " + << direntCache.getMisses() << " ratio " + << direntCache.hitRatio() * 100 << "% fillfactor " + << direntCache.fillfactor()); + pthread_mutex_unlock(&direntCacheLock); + return v.second; + } + + log_debug("dirent " << idx << " not found in cache; hits " + << direntCache.getHits() << " misses " << direntCache.getMisses() + << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor " + << direntCache.fillfactor()); + pthread_mutex_unlock(&direntCacheLock); + + offset_t indexOffset = getOffset(urlPtrOffsetReader.get(), idx.v); + // We don't know the size of the dirent because it depends of the size of + // the title, url and extra parameters. + // This is a pitty but we have no choices. + // We cannot take a buffer of the size of the file, it would be really inefficient. + // Let's do try, catch and retry while chosing a smart value for the buffer size. + // Most dirent will be "Article" entry (header's size == 16) without extra parameters. + // Let's hope that url + title size will be < 256 and if not try again with a bigger size. + + pthread_mutex_lock(&bufferDirentLock); + zsize_t bufferSize = zsize_t(256); + // On very small file, the offset + 256 is higher than the size of the file, + // even if the file is valid. + // So read only to the end of the file. 
+ auto totalSize = zimReader->size(); + if (indexOffset.v + 256 > totalSize.v) bufferSize = zsize_t(totalSize.v-indexOffset.v); + std::shared_ptr dirent; + while (true) { + bufferDirentZone.reserve(size_type(bufferSize)); + zimReader->read(bufferDirentZone.data(), indexOffset, bufferSize); + auto direntBuffer = std::unique_ptr(new MemoryBuffer(bufferDirentZone.data(), bufferSize)); + try { + dirent = std::make_shared(std::move(direntBuffer)); + } catch (InvalidSize&) { + // buffer size is not enougth, try again : + bufferSize += 256; + continue; + } + // Success ! + break; + } + pthread_mutex_unlock(&bufferDirentLock); + + log_debug("dirent read from " << indexOffset); + pthread_mutex_lock(&direntCacheLock); + direntCache.put(idx, dirent); + pthread_mutex_unlock(&direntCacheLock); + + return dirent; + } + + std::shared_ptr FileImpl::getDirentByTitle(article_index_t idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + return getDirent(getIndexByTitle(idx)); + } + + article_index_t FileImpl::getIndexByTitle(article_index_t idx) + { + if (idx >= getCountArticles()) + throw ZimFileFormatError("article index out of range"); + + article_index_t ret(titleIndexReader->read_uint( + offset_t(sizeof(article_index_t)*idx.v))); + + return ret; + } + + std::shared_ptr FileImpl::getCluster(cluster_index_t idx) + { + if (idx >= getCountClusters()) + throw ZimFileFormatError("cluster index out of range"); + + pthread_mutex_lock(&clusterCacheLock); + auto cluster(clusterCache.get(idx)); + pthread_mutex_unlock(&clusterCacheLock); + if (cluster) + { + log_debug("cluster " << idx << " found in cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); + return cluster; + } + + offset_t clusterOffset(getClusterOffset(idx)); + log_debug("read cluster " << idx << " from offset " << clusterOffset); + CompressionType comp; + 
bool extended; + std::shared_ptr reader = zimReader->sub_clusterReader(clusterOffset, &comp, &extended); + cluster = std::shared_ptr(new Cluster(reader, comp, extended)); + + log_debug("put cluster " << idx << " into cluster cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor()); + pthread_mutex_lock(&clusterCacheLock); + clusterCache.put(idx, cluster); + pthread_mutex_unlock(&clusterCacheLock); + + return cluster; + } + + offset_t FileImpl::getOffset(const Reader* reader, size_t idx) + { + offset_t offset(reader->read_uint(offset_t(sizeof(offset_type)*idx))); + return offset; + } + + offset_t FileImpl::getClusterOffset(cluster_index_t idx) + { + return getOffset(clusterOffsetReader.get(), idx.v); + } + + offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx) + { + auto cluster = getCluster(clusterIdx); + if (cluster->isCompressed()) + return offset_t(0); + return getClusterOffset(clusterIdx) + offset_t(1) + cluster->getBlobOffset(blobIdx); + } + + article_index_t FileImpl::getNamespaceBeginOffset(char ch) + { + log_trace("getNamespaceBeginOffset(" << ch << ')'); + + pthread_mutex_lock(&namespaceBeginLock); + NamespaceCache::const_iterator it = namespaceBeginCache.find(ch); + if (it != namespaceBeginCache.end()) + { + article_index_t ret(it->second); + pthread_mutex_unlock(&namespaceBeginLock); + return ret; + } + pthread_mutex_unlock(&namespaceBeginLock); + + article_index_type lower = 0; + article_index_type upper = article_index_type(getCountArticles()); + auto d = getDirent(article_index_t(0)); + while (upper - lower > 1) + { + article_index_type m = lower + (upper - lower) / 2; + auto d = getDirent(article_index_t(m)); + if (d->getNamespace() >= ch) + upper = m; + else + lower = m; + } + + article_index_t ret = article_index_t(d->getNamespace() < ch ? 
upper : lower); + pthread_mutex_lock(&namespaceBeginLock); + namespaceBeginCache[ch] = ret; + pthread_mutex_unlock(&namespaceBeginLock); + + return ret; + } + + article_index_t FileImpl::getNamespaceEndOffset(char ch) + { + log_trace("getNamespaceEndOffset(" << ch << ')'); + + pthread_mutex_lock(&namespaceEndLock); + NamespaceCache::const_iterator it = namespaceEndCache.find(ch); + if (it != namespaceEndCache.end()) + { + article_index_t ret = it->second; + pthread_mutex_unlock(&namespaceEndLock); + return ret; + } + pthread_mutex_unlock(&namespaceEndLock); + + article_index_type lower = 0; + article_index_type upper = article_index_type(getCountArticles()); + log_debug("namespace " << ch << " lower=" << lower << " upper=" << upper); + while (upper - lower > 1) + { + article_index_type m = lower + (upper - lower) / 2; + auto d = getDirent(article_index_t(m)); + if (d->getNamespace() > ch) + upper = m; + else + lower = m; + log_debug("namespace " << d->getNamespace() << " m=" << m << " lower=" << lower << " upper=" << upper); + } + + pthread_mutex_lock(&namespaceEndLock); + namespaceEndCache[ch] = article_index_t(upper); + pthread_mutex_unlock(&namespaceEndLock); + + return article_index_t(upper); + } + + std::string FileImpl::getNamespaces() + { + std::string namespaces; + + auto d = getDirent(article_index_t(0)); + namespaces = d->getNamespace(); + + article_index_t idx(0); + while ((idx = getNamespaceEndOffset(d->getNamespace())) < getCountArticles()) + { + d = getDirent(idx); + namespaces += d->getNamespace(); + } + + return namespaces; + } + + const std::string& FileImpl::getMimeType(uint16_t idx) const + { + if (idx > mimeTypes.size()) + { + std::ostringstream msg; + msg << "unknown mime type code " << idx; + throw std::runtime_error(msg.str()); + } + + return mimeTypes[idx]; + } + + std::string FileImpl::getChecksum() + { + if (!header.hasChecksum()) + return std::string(); + + std::shared_ptr chksum; + try { + chksum = 
zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + } catch (...) + { + log_warn("error reading checksum"); + return std::string(); + } + + char hexdigest[33]; + hexdigest[32] = '\0'; + static const char hex[] = "0123456789abcdef"; + char* p = hexdigest; + for (int i = 0; i < 16; ++i) + { + uint8_t v = chksum->at(offset_t(i)); + *p++ = hex[v >> 4]; + *p++ = hex[v & 0xf]; + } + log_debug("chksum=" << hexdigest); + return hexdigest; + } + + bool FileImpl::verify() + { + if (!header.hasChecksum()) + return false; + + struct zim_MD5_CTX md5ctx; + zim_MD5Init(&md5ctx); + + offset_type checksumPos = header.getChecksumPos(); + offset_type currentPos = 0; + for(auto part = zimFile->begin(); + part != zimFile->end(); + part++) { + std::ifstream stream(part->second->filename()); + char ch; + for(/*NOTHING*/ ; currentPos < checksumPos && stream.get(ch).good(); currentPos++) { + zim_MD5Update(&md5ctx, reinterpret_cast(&ch), 1); + } + if (stream.bad()) { + perror("error while reading file"); + return false; + } + if (currentPos == checksumPos) { + break; + } + } + + if (currentPos != checksumPos) { + return false; + } + + unsigned char chksumCalc[16]; + auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + + zim_MD5Final(chksumCalc, &md5ctx); + if (std::memcmp(chksumFile->data(), chksumCalc, 16) != 0) + { + return false; + } + + return true; + } + + time_t FileImpl::getMTime() const { + return zimFile->getMTime(); + } + + zim::zsize_t FileImpl::getFilesize() const { + return zimFile->fsize(); + } + + bool FileImpl::is_multiPart() const { + return zimFile->is_multiPart(); + } +} diff --git a/src/fileimpl.h b/src/fileimpl.h new file mode 100644 index 0000000..ecddadb --- /dev/null +++ b/src/fileimpl.h @@ -0,0 +1,122 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free 
Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEIMPL_H +#define ZIM_FILEIMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include "cache.h" +#include "_dirent.h" +#include "cluster.h" +#include "buffer.h" +#include "file_reader.h" +#include "file_compound.h" +#include "zim_types.h" + +namespace zim +{ + class FileImpl + { + std::shared_ptr zimFile; + std::shared_ptr zimReader; + std::vector bufferDirentZone; + pthread_mutex_t bufferDirentLock; + Fileheader header; + std::string filename; + + std::unique_ptr titleIndexReader; + std::unique_ptr urlPtrOffsetReader; + std::unique_ptr clusterOffsetReader; + + offset_t getOffset(const Reader* reader, size_t idx); + + Cache> direntCache; + pthread_mutex_t direntCacheLock; + + Cache> clusterCache; + pthread_mutex_t clusterCacheLock; + + bool cacheUncompressedCluster; + typedef std::map NamespaceCache; + + NamespaceCache namespaceBeginCache; + pthread_mutex_t namespaceBeginLock; + NamespaceCache namespaceEndCache; + pthread_mutex_t namespaceEndLock; + + typedef std::vector MimeTypes; + MimeTypes mimeTypes; + + using pair_type = std::pair; + std::vector articleListByCluster; + std::once_flag orderOnceFlag; + + public: + explicit FileImpl(const std::string& fname); + + time_t getMTime() const; + + const std::string& getFilename() const { return filename; } + const Fileheader& getFileheader() const { return 
header; } + zsize_t getFilesize() const; + + std::pair + getFileParts(offset_t offset, zsize_t size); + std::shared_ptr getDirent(article_index_t idx); + std::shared_ptr getDirentByTitle(article_index_t idx); + article_index_t getIndexByTitle(article_index_t idx); + article_index_t getCountArticles() const { return article_index_t(header.getArticleCount()); } + + + std::pair findx(char ns, const std::string& url); + std::pair findx(const std::string& url); + std::pair findxByTitle(char ns, const std::string& title); + std::pair findxByClusterOrder(article_index_type idx); + + std::shared_ptr getCluster(cluster_index_t idx); + cluster_index_t getCountClusters() const { return cluster_index_t(header.getClusterCount()); } + offset_t getClusterOffset(cluster_index_t idx); + offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx); + + article_index_t getNamespaceBeginOffset(char ch); + article_index_t getNamespaceEndOffset(char ch); + article_index_t getNamespaceCount(char ns) + { return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); } + + std::string getNamespaces(); + bool hasNamespace(char ch) const; + + const std::string& getMimeType(uint16_t idx) const; + + std::string getChecksum(); + bool verify(); + bool is_multiPart() const; + }; + +} + +#endif // ZIM_FILEIMPL_H + diff --git a/src/fs.h b/src/fs.h new file mode 100644 index 0000000..5736a5e --- /dev/null +++ b/src/fs.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_H_ +#define ZIM_FS_H_ + +#ifdef _WIN32 +# include "fs_windows.h" +#else +# include "fs_unix.h" +#endif + +namespace zim { + +#ifdef _WIN32 +using DEFAULTFS = windows::FS; +#else +using DEFAULTFS = unix::FS; +#endif +}; + +#endif //ZIM_FS_H_ diff --git a/src/fs_unix.cpp b/src/fs_unix.cpp new file mode 100644 index 0000000..145dbc0 --- /dev/null +++ b/src/fs_unix.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fs_unix.h" +#include + +#include +#include +#include +#include +#include +#include + +namespace zim +{ + +namespace unix { + +zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const +{ +#if defined(__APPLE__) || defined(__OpenBSD__) +# define PREAD pread +#else +# define PREAD pread64 +#endif + ssize_t full_size_read = 0; + auto size_to_read = size.v; + auto current_offset = offset.v; + errno = 0; + while (size_to_read > 0) { + auto size_read = PREAD(m_fd, dest, size_to_read, current_offset); + if (size_read == -1) { + return zsize_t(-1); + } + size_to_read -= size_read; + current_offset += size_read; + full_size_read += size_read; + } + return zsize_t(full_size_read); +#undef PREAD +} + +zsize_t FD::getSize() const +{ + struct stat sb; + fstat(m_fd, &sb); + return zsize_t(sb.st_size); +} + +bool FD::seek(offset_t offset) +{ + return static_cast(offset.v) == lseek(m_fd, offset.v, SEEK_SET); +} + +bool FD::close() { + if (m_fd != -1) { + return ::close(m_fd); + } + return -1; +} + +FD FS::openFile(path_t filepath) +{ + int fd = open(filepath.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error(""); + } + return FD(fd); +} + +bool FS::makeDirectory(path_t path) +{ + return !mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); +} + +void FS::rename(path_t old_path, path_t new_path) +{ + ::rename(old_path.c_str(), new_path.c_str()); +} + +std::string FS::join(path_t base, path_t name) +{ + return base + "/" + name; +} + +bool FS::remove(path_t path) +{ + DIR* dir; + /* It's a directory, remove all its entries first */ + if ((dir = opendir(path.c_str())) != NULL) { + struct dirent* ent; + while ((ent = readdir(dir)) != NULL) { + std::string childName = ent->d_name; + if (childName != "." 
&& childName != "..") { + auto childPath = join(path, childName); + remove(childPath); + } + } + closedir(dir); + return removeDir(path); + } + + /* It's a file */ + else { + return removeFile(path); + } +} + +bool FS::removeDir(path_t path) { + return rmdir(path.c_str()); +} + +bool FS::removeFile(path_t path) { + return ::remove(path.c_str()); +} + + +}; // unix namespace + +}; // zim namespace + diff --git a/src/fs_unix.h b/src/fs_unix.h new file mode 100644 index 0000000..1e79e9e --- /dev/null +++ b/src/fs_unix.h @@ -0,0 +1,90 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_UNIX_H_ +#define ZIM_FS_UNIX_H_ + +#include "zim_types.h" + +#include + +#include +#include +#include +#include +#include + +namespace zim { + +namespace unix { + +using path_t = const std::string&; + +class FD { + public: + using fd_t = int; + + private: + fd_t m_fd = -1; + + public: + FD() = default; + FD(fd_t fd): + m_fd(fd) {}; + FD(const FD& o) = delete; + FD(FD&& o) : + m_fd(o.m_fd) { o.m_fd = -1; } + FD& operator=(FD&& o) { + m_fd = o.m_fd; + o.m_fd = -1; + return *this; + } + ~FD() { close(); } + zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; + zsize_t getSize() const; + fd_t getNativeHandle() const + { + return m_fd; + } + fd_t release() + { + int ret = m_fd; + m_fd = -1; + return ret; + } + bool seek(offset_t offset); + bool close(); +}; + +struct FS { + using FD = zim::unix::FD; + static std::string join(path_t base, path_t name); + static FD openFile(path_t filepath); + static bool makeDirectory(path_t path); + static void rename(path_t old_path, path_t new_path); + static bool remove(path_t path); + static bool removeDir(path_t path); + static bool removeFile(path_t path); +}; + +}; // unix namespace + +}; // zim namespace + +#endif //ZIM_FS_UNIX_H_ diff --git a/src/fs_windows.cpp b/src/fs_windows.cpp new file mode 100644 index 0000000..e4df1e4 --- /dev/null +++ b/src/fs_windows.cpp @@ -0,0 +1,199 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fs_windows.h" +#include + +#include +#include +#include +#include +#include + +#include +#include + +namespace zim { + +namespace windows { + +struct ImplFD { + HANDLE m_handle = INVALID_HANDLE_VALUE; + CRITICAL_SECTION m_criticalSection; + + ImplFD() { + InitializeCriticalSection(&m_criticalSection); + } + ImplFD(HANDLE handle) : + m_handle(handle) + { + InitializeCriticalSection(&m_criticalSection); + } + + ~ImplFD() { + DeleteCriticalSection(&m_criticalSection); + } +}; + +FD::FD() : + mp_impl(new ImplFD()) {} + +FD::FD(fd_t handle) : + mp_impl(new ImplFD(handle)) {} + +FD::FD(int fd): + mp_impl(new ImplFD(reinterpret_cast(_get_osfhandle(fd)))) {} + +FD::FD(FD&& o) = default; +FD& FD::operator=(FD&& o) = default; + +FD::~FD() +{ + if (mp_impl) + close(); +} + +zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const +{ + if (!mp_impl) + return zsize_t(-1); + EnterCriticalSection(&mp_impl->m_criticalSection); + LARGE_INTEGER off; + off.QuadPart = offset.v; + if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) { + goto err; + } + + DWORD size_read; + if (!ReadFile(mp_impl->m_handle, dest, size.v, &size_read, NULL)) { + goto err; + } + if (size_read != size.v) { + goto err; + } + LeaveCriticalSection(&mp_impl->m_criticalSection); + return size; +err: + LeaveCriticalSection(&mp_impl->m_criticalSection); + return zsize_t(-1); +} + +bool FD::seek(offset_t offset) +{ + if(!mp_impl) + return false; + LARGE_INTEGER off; + 
off.QuadPart = offset.v; + return SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN); +} + +zsize_t FD::getSize() const +{ + if(!mp_impl) + return zsize_t(0); + LARGE_INTEGER size; + if (!GetFileSizeEx(mp_impl->m_handle, &size)) { + size.QuadPart = 0; + } + return zsize_t(size.QuadPart); +} + +int FD::release() +{ + if(!mp_impl) + return -1; + int ret = _open_osfhandle(reinterpret_cast(mp_impl->m_handle), 0); + mp_impl->m_handle = INVALID_HANDLE_VALUE; + return ret; +} + +bool FD::close() +{ + if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) { + return false; + } + return CloseHandle(mp_impl->m_handle); +} + +std::unique_ptr FS::toWideChar(path_t path) +{ + auto size = MultiByteToWideChar(CP_UTF8, 0, + path.c_str(), -1, nullptr, 0); + auto wdata = std::unique_ptr(new wchar_t[size]); + auto ret = MultiByteToWideChar(CP_UTF8, 0, + path.c_str(), -1, wdata.get(), size); + if (0 == ret) { + std::ostringstream oss; + oss << "Cannot convert path to wchar : " << GetLastError(); + throw std::runtime_error(oss.str()); + } + return wdata; +} + +FD FS::openFile(path_t filepath) +{ + auto wpath = toWideChar(filepath); + FD::fd_t handle; + handle = CreateFileW(wpath.get(), + GENERIC_READ, + FILE_SHARE_READ, + NULL, + OPEN_EXISTING, + FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS, + NULL); + if (handle == INVALID_HANDLE_VALUE) { + std::ostringstream oss; + oss << "Cannot open file : " << GetLastError(); + throw std::runtime_error(oss.str()); + } + return FD(handle); +} + +bool FS::makeDirectory(path_t path) +{ + auto wpath = toWideChar(path); + auto ret = CreateDirectoryW(wpath.get(), NULL); + return ret; +} + + +void FS::rename(path_t old_path, path_t new_path) +{ + MoveFileW(toWideChar(old_path).get(), toWideChar(new_path).get()); +} + +std::string FS::join(path_t base, path_t name) +{ + return base + "\\" + name; +} + +bool FS::removeDir(path_t path) +{ + return RemoveDirectoryW(toWideChar(path).get()); +} + +bool FS::removeFile(path_t path) +{ + return 
DeleteFileW(toWideChar(path).get()); +} + +}; // windows namespace + +}; // zim namespace + diff --git a/src/fs_windows.h b/src/fs_windows.h new file mode 100644 index 0000000..60d1062 --- /dev/null +++ b/src/fs_windows.h @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FS_WINDOWS_H_ +#define ZIM_FS_WINDOWS_H_ + +#include "zim_types.h" + +#include +#include + +typedef void* HANDLE; + +namespace zim { + +namespace windows { + +using path_t = const std::string&; + +struct ImplFD; + +class FD { + public: + typedef HANDLE fd_t; + private: + std::unique_ptr mp_impl; + + public: + FD(); + FD(fd_t handle); + FD(int fd); + FD(const FD& o) = delete; + FD(FD&& o); + FD& operator=(FD&& o); + FD& operator=(const FD& o) = delete; + ~FD(); + zsize_t readAt(char* dest, zsize_t size, offset_t offset) const; + zsize_t getSize() const; + int release(); + bool seek(offset_t offset); + bool close(); +}; + +struct FS { + using FD = zim::windows::FD; + static std::string join(path_t base, path_t name); + static std::unique_ptr toWideChar(path_t path); + static FD openFile(path_t filepath); + static bool makeDirectory(path_t path); + static void rename(path_t old_path, path_t new_path); + static bool 
remove(path_t path); + static bool removeDir(path_t path); + static bool removeFile(path_t path); +}; + +}; // windows namespace + +}; // zim namespace + +#endif //ZIM_FS_WINDOWS_H_ diff --git a/src/levenshtein.cpp b/src/levenshtein.cpp new file mode 100644 index 0000000..a520c01 --- /dev/null +++ b/src/levenshtein.cpp @@ -0,0 +1,31 @@ + +#include "levenshtein.h" +#include +#include + +int levenshtein_distance(const std::string &s1, const std::string &s2) +{ + int s1len = s1.size(); + int s2len = s2.size(); + + auto column_start = (decltype(s1len))1; + + auto column = new decltype(s1len)[s1len + 1]; + std::iota(column + column_start - 1, column + s1len + 1, column_start - 1); + + for (auto x = column_start; x <= s2len; x++) { + column[0] = x; + auto last_diagonal = x - column_start; + for (auto y = column_start; y <= s1len; y++) { + auto old_diagonal = column[y]; + auto v1 = column[y] + 1; + auto v2 = column[y - 1] + 1; + auto v3 = last_diagonal + (s1[y - 1] == s2[x - 1]? 0 : 1); + column[y] = v1 + +int levenshtein_distance(const std::string &s1, const std::string &s2); + +#endif // LEVENSHTEIN_H diff --git a/src/log.h b/src/log.h new file mode 100644 index 0000000..5fbd81a --- /dev/null +++ b/src/log.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#ifdef WITH_CXXTOOLS + +#include + +#else + +#define log_define(e) +#define log_fatal(e) +#define log_error(e) +#define log_warn(e) +#define log_info(e) +#define log_debug(e) +#define log_trace(e) +#define log_init() + +#endif diff --git a/src/md5.c b/src/md5.c new file mode 100644 index 0000000..bae002e --- /dev/null +++ b/src/md5.c @@ -0,0 +1,340 @@ +/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +#include "md5.h" +#include + +#define MD5_CTX struct zim_MD5_CTX + +/* Constants for MD5Transform routine. 
+ */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + +static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); +static void Encode PROTO_LIST + ((unsigned char *, UINT4 *, unsigned int)); +static void Decode PROTO_LIST + ((UINT4 *, const unsigned char *, unsigned int)); +/* +static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int)); +static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int)); +*/ +#define MD5_memcpy memcpy +#define MD5_memset memset + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. + */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +Rotation is separate from addition to prevent recomputation. 
+ */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. + */ +void zim_MD5Init (MD5_CTX* context) +{ + context->count[0] = context->count[1] = 0; + /* Load magic initialization constants. +*/ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* MD5 block update operation. Continues an MD5 message-digest + operation, processing another message block, and updating the + context. + */ +void zim_MD5Update ( +MD5_CTX *context, +const unsigned char *input, /* input block */ +unsigned int inputLen) /* length of input block */ +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context->count[1]++; + context->count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. 
+*/ + if (inputLen >= partLen) { + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)input, partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)&input[i], + inputLen-i); +} + +/* MD5 finalization. Ends an MD5 message-digest operation, writing the + the message digest and zeroizing the context. + */ +void zim_MD5Final ( +unsigned char digest[16], /* message digest */ +MD5_CTX *context) /* context */ +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. +*/ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + zim_MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + zim_MD5Update (context, bits, 8); + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
+ */ +static void MD5Transform ( +UINT4 state[4], +const unsigned char block[64]) +{ + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH 
(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)x, 0, sizeof (x)); +} + +/* Encodes input (UINT4) into output (unsigned char). Assumes len is + a multiple of 4. 
+ */ +static void Encode ( +unsigned char *output, +UINT4 *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* Decodes input (unsigned char) into output (UINT4). Assumes len is + a multiple of 4. + */ +static void Decode ( +UINT4 *output, +const unsigned char *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} + +#if 0 +/* Note: Replace "for loop" with standard memcpy if possible. + */ + +static void MD5_memcpy ( +POINTER output, +POINTER input, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + output[i] = input[i]; +} + +/* Note: Replace "for loop" with standard memset if possible. + */ +static void MD5_memset ( +POINTER output, +int value, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + ((char *)output)[i] = (char)value; +} +#endif diff --git a/src/md5.h b/src/md5.h new file mode 100644 index 0000000..29bdc39 --- /dev/null +++ b/src/md5.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction. 
Specifically, if other files + * instantiate templates or use macros or inline functions from this + * file, or you compile this file and link it with other files to + * produce an executable, this file does not by itself cause the + * resulting executable to be covered by the GNU General Public + * License. This exception does not however invalidate any other + * reasons why the executable file might be covered by the GNU Library + * General Public License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +/* RSAREF types and constants + */ + +/* PROTOTYPES should be set to one if and only if the compiler supports + function argument prototyping. 
+The following makes PROTOTYPES default to 0 if it has not already + been defined with C compiler flags. + */ + +#ifndef ZIM_MD5_H +#define ZIM_MD5_H + +#ifndef PROTOTYPES +#define PROTOTYPES 1 +#endif + +/* POINTER defines a generic pointer type */ +typedef unsigned char *POINTER; + +/* UINT2 defines a two byte word */ +typedef unsigned short int UINT2; + +/* UINT4 defines a four byte word */ +typedef unsigned int UINT4; + +/* PROTO_LIST is defined depending on how PROTOTYPES is defined above. + If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it + returns an empty list. + */ + +#if PROTOTYPES +#define PROTO_LIST(list) list +#else +#define PROTO_LIST(list) () +#endif + +/* MD5 context. */ +struct zim_MD5_CTX { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *)); +void zim_MD5Update PROTO_LIST + ((struct zim_MD5_CTX *, const unsigned char *, unsigned int)); +void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZIM_MD5_H */ diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..4814914 --- /dev/null +++ b/src/meson.build @@ -0,0 +1,74 @@ + +configure_file(output : 'config.h', + configuration : conf, + input : 'config.h.in') + +src_directory = include_directories('.') + +common_sources = [ +# 'config.h', + 'article.cpp', + 'cluster.cpp', + 'dirent.cpp', + 'envvalue.cpp', + 'file.cpp', + 'fileheader.cpp', + 'fileimpl.cpp', + 'file_compound.cpp', + 'file_reader.cpp', + 'blob.cpp', + 'buffer.cpp', + 'md5.c', + 'search.cpp', + 'search_iterator.cpp', + 'template.cpp', + 'uuid.cpp', + 'levenshtein.cpp', + 'tools.cpp', + 'compression.cpp', + 'writer/creator.cpp', + 'writer/article.cpp', + 'writer/cluster.cpp', + 'writer/dirent.cpp', + 'writer/workers.cpp', + 
'writer/xapianIndexer.cpp' +] + +if host_machine.system() == 'windows' + common_sources += 'fs_windows.cpp' +else + common_sources += 'fs_unix.cpp' +endif + +xapian_sources = [ + 'xapian/htmlparse.cc', + 'xapian/myhtmlparse.cc' +] + +sources = common_sources +deps = [thread_dep, lzma_dep] + +if zlib_dep.found() + deps += [zlib_dep] +endif + +if zstd_dep.found() + deps += [zstd_dep] +endif + +if xapian_dep.found() + sources += xapian_sources + sources += lib_resources + deps += [xapian_dep, icu_dep] +endif + +libzim = library('zim', + sources, + include_directories : inc, + dependencies : deps, + link_args : extra_link_args, + cpp_args : extra_cpp_args, + version: meson.project_version(), + install : true) +libzim_dep = declare_dependency(link_with: libzim, + include_directories: include_directory) diff --git a/src/search.cpp b/src/search.cpp new file mode 100644 index 0000000..a1296a3 --- /dev/null +++ b/src/search.cpp @@ -0,0 +1,443 @@ +/* + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "search_internal.h" +#include "levenshtein.h" +#include "fs.h" + +#include + +#include +#include +#if !defined(_WIN32) +# include +#else +# include +#endif +#include + +#if defined(ENABLE_XAPIAN) +#include "xapian.h" +#include +#endif + +#define MAX_MATCHES_TO_SORT 10000 + +namespace zim +{ + +#if defined(ENABLE_XAPIAN) +namespace { +/* Split string in a token array */ +std::vector split(const std::string & str, + const std::string & delims=" *-") +{ + std::string::size_type lastPos = str.find_first_not_of(delims, 0); + std::string::size_type pos = str.find_first_of(delims, lastPos); + std::vector tokens; + + while (std::string::npos != pos || std::string::npos != lastPos) + { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(delims, pos); + pos = str.find_first_of(delims, lastPos); + } + + return tokens; +} + +std::map read_valuesmap(const std::string &s) { + std::map result; + std::vector elems = split(s, ";"); + for(std::vector::iterator elem = elems.begin(); + elem != elems.end(); + elem++) + { + std::vector tmp_elems = split(*elem, ":"); + result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); + } + return result; +} + + +void +setup_queryParser(Xapian::QueryParser* queryparser, + Xapian::Database& database, + const std::string& language, + const std::string& stopwords, + bool newSuggestionFormat) { + queryparser->set_default_op(Xapian::Query::op::OP_AND); + queryparser->set_database(database); + if ( ! 
language.empty() ) + { + /* Build ICU Local object to retrieve ISO-639 language code (from + ISO-639-3) */ + icu::Locale languageLocale(language.c_str()); + + /* Configuring language base steemming */ + try { + Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage()); + queryparser->set_stemmer(stemmer); + queryparser->set_stemming_strategy( + newSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL); + } catch (...) { + std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + } + } + + if ( ! stopwords.empty() ) + { + std::string stopWord; + std::istringstream file(stopwords); + Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); + while (std::getline(file, stopWord, '\n')) { + stopper->add(stopWord); + } + stopper->release(); + queryparser->set_stopper(stopper); + } +} + +class LevenshteinDistanceMaker : public Xapian::KeyMaker { + public: + LevenshteinDistanceMaker(const std::string& query, size_t value_index): + query(query), + value_index(value_index) {} + ~LevenshteinDistanceMaker() = default; + + virtual std::string operator() (const Xapian::Document &doc) const { + auto document_value = doc.get_value(value_index); + return Xapian::sortable_serialise( + levenshtein_distance(document_value, query)); + } + private: + std::string query; + size_t value_index; +}; + +} +#endif + +Search::Search(const std::vector zimfiles) : + internal(new InternalData), + zimfiles(zimfiles), + prefixes(""), query(""), + latitude(0), longitude(0), distance(0), + range_start(0), range_end(0), + suggestion_mode(false), + geo_query(false), + search_started(false), + has_database(false), + verbose(false), + estimated_matches_number(0) +{} + +Search::Search(const File* zimfile) : + internal(new InternalData), + prefixes(""), query(""), + latitude(0), longitude(0), distance(0), + range_start(0), range_end(0), + suggestion_mode(false), + geo_query(false), + search_started(false), + 
has_database(false), + verbose(false), + estimated_matches_number(0) +{ + zimfiles.push_back(zimfile); +} + +Search::Search(const Search& it) : + internal(new InternalData), + zimfiles(it.zimfiles), + prefixes(it.prefixes), + query(it.query), + latitude(it.latitude), longitude(it.longitude), distance(it.distance), + range_start(it.range_start), range_end(it.range_end), + suggestion_mode(it.suggestion_mode), + geo_query(it.geo_query), + search_started(false), + has_database(false), + verbose(it.verbose), + estimated_matches_number(0) +{ } + +Search& Search::operator=(const Search& it) +{ + if ( internal ) internal.reset(); + zimfiles = it.zimfiles; + prefixes = it.prefixes; + query = it.query; + latitude = it.latitude; + longitude = it.longitude; + distance = it.distance; + range_start = it.range_start; + range_end = it.range_end; + suggestion_mode = it.suggestion_mode; + geo_query = it.geo_query; + search_started = false; + has_database = false; + verbose = it.verbose; + estimated_matches_number = 0; + return *this; +} + +Search::Search(Search&& it) = default; +Search& Search::operator=(Search&& it) = default; +Search::~Search() = default; + +void Search::set_verbose(bool verbose) { + this->verbose = verbose; +} + +Search& Search::add_zimfile(const File* zimfile) { + zimfiles.push_back(zimfile); + return *this; +} + +Search& Search::set_query(const std::string& query) { + this->query = query; + return *this; +} + +Search& Search::set_georange(float latitude, float longitude, float distance) { + this->latitude = latitude; + this->longitude = longitude; + this->distance = distance; + geo_query = true; + return *this; +} + +Search& Search::set_range(int start, int end) { + this->range_start = start; + this->range_end = end; + return *this; +} + +Search& Search::set_suggestion_mode(const bool suggestion_mode) { + this->suggestion_mode = suggestion_mode; + return *this; +} + +#define WITH_LEV 1 + +Search::iterator Search::begin() const { +#if defined(ENABLE_XAPIAN) + if 
( this->search_started ) { + return new search_iterator::InternalData(this, internal->results.begin()); + } + + std::vector::const_iterator it; + bool first = true; + bool hasNewSuggestionFormat = false; + std::string language; + std::string stopwords; + for(it=zimfiles.begin(); it!=zimfiles.end(); it++) + { + const File* zimfile = *it; + if (zimfile->is_multiPart()) { + continue; + } + zim::Article xapianArticle; + if (suggestion_mode) { + xapianArticle = zimfile->getArticle('X', "title/xapian"); + if (xapianArticle.good()) { + hasNewSuggestionFormat = true; + } + } + if (!xapianArticle.good()) { + xapianArticle = zimfile->getArticle('X', "fulltext/xapian"); + } + if (!xapianArticle.good()) { + xapianArticle = zimfile->getArticle('Z', "/fulltextIndex/xapian"); + } + if (!xapianArticle.good()) { + continue; + } + auto dbOffset = xapianArticle.getOffset(); + if (dbOffset == 0) { + continue; + } + DEFAULTFS::FD databasefd; + try { + databasefd = DEFAULTFS::openFile(zimfile->getFilename()); + } catch (...) { + std::cerr << "Impossible to open " << zimfile->getFilename() << std::endl; + std::cerr << strerror(errno) << std::endl; + continue; + } + if (!databasefd.seek(offset_t(dbOffset))) { + std::cerr << "Something went wrong seeking databasedb " + << zimfile->getFilename() << std::endl; + std::cerr << "dbOffest = " << dbOffset << std::endl; + continue; + } + Xapian::Database database; + try { + database = Xapian::Database(databasefd.release()); + } catch( Xapian::DatabaseError& e) { + std::cerr << "Something went wrong opening xapian database for zimfile " + << zimfile->getFilename() << std::endl; + std::cerr << "dbOffest = " << dbOffset << std::endl; + std::cerr << "error = " << e.get_msg() << std::endl; + continue; + } + + if ( first ) { + this->valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + language = database.get_metadata("language"); + if (language.empty() ) { + // Database created before 2017/03 has no language metadata. 
+ // However, term were stemmed anyway and we need to stem our + // search query the same the database was created. + // So we need a language, let's use the one of the zim. + // If zimfile has no language metadata, we can't do lot more here :/ + auto article = zimfile->getArticle('M', "Language"); + if ( article.good() ) { + language = article.getData(); + } + } + stopwords = database.get_metadata("stopwords"); + this->prefixes = database.get_metadata("prefixes"); + } else { + std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + if (this->valuesmap != valuesmap ) { + // [TODO] Ignore the database, raise a error ? + } + } + internal->xapian_databases.push_back(database); + internal->database.add_database(database); + has_database = true; + } + + if ( ! has_database ) { + if (verbose) { + std::cout << "No database, no result" << std::endl; + } + estimated_matches_number = 0; + return nullptr; + } + + Xapian::QueryParser* queryParser = new Xapian::QueryParser(); + if (verbose) { + std::cout << "Setup queryparser using language " << language << std::endl; + } + setup_queryParser(queryParser, internal->database, language, stopwords, hasNewSuggestionFormat); + + std::string prefix = ""; + unsigned flags = Xapian::QueryParser::FLAG_DEFAULT; + if (suggestion_mode) { + if (verbose) { + std::cout << "Mark query as 'partial'" << std::endl; + } + flags |= Xapian::QueryParser::FLAG_PARTIAL; + if ( !hasNewSuggestionFormat + && this->prefixes.find("S") != std::string::npos ) { + if (verbose) { + std::cout << "Searching in title namespace" << std::endl; + } + prefix = "S"; + } + } + Xapian::Query query; + try { + query = queryParser->parse_query(this->query, flags, prefix); + } catch (Xapian::QueryParserError& e) { + estimated_matches_number = 0; + return nullptr; + } + if (verbose) { + std::cout << "Parsed query '" << this->query << "' to " << query.get_description() << std::endl; + } + delete queryParser; + + Xapian::Enquire enquire(internal->database); 
+#if WITH_LEV + std::unique_ptr keyMaker(nullptr); +#endif + + if (geo_query && valuesmap.find("geo.position") != valuesmap.end()) { + Xapian::GreatCircleMetric metric; + Xapian::LatLongCoord centre(latitude, longitude); + Xapian::LatLongDistancePostingSource ps(valuesmap["geo.position"], centre, metric, distance); + if ( this->query.empty()) { + query = Xapian::Query(&ps); + } else { + query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps)); + } + } + + enquire.set_query(query); + +#if WITH_LEV + if (suggestion_mode && !hasNewSuggestionFormat) { + size_t value_index = 0; + bool has_custom_distance_maker = true; + if ( !valuesmap.empty() ) { + if ( valuesmap.find("title") != valuesmap.end() ) { + value_index = valuesmap["title"]; + } else { + // This should not happen as valuesmap has a title entry, but let's + // be tolerent. + has_custom_distance_maker = false; + } + } + auto temp_results = enquire.get_mset(0,0); + if ( has_custom_distance_maker + && temp_results.get_matches_estimated() <= MAX_MATCHES_TO_SORT ) { + keyMaker.reset(new LevenshteinDistanceMaker(this->query, value_index)); + enquire.set_sort_by_key(keyMaker.get(), false); + } + } +#endif + + if (suggestion_mode && valuesmap.find("title") != valuesmap.end()) { + enquire.set_collapse_key(valuesmap["title"]); + } + + internal->results = enquire.get_mset(this->range_start, this->range_end-this->range_start); + search_started = true; + estimated_matches_number = internal->results.get_matches_estimated(); + return new search_iterator::InternalData(this, internal->results.begin()); +#else + estimated_matches_number = 0; + return nullptr; +#endif +} + +Search::iterator Search::end() const { +#if defined(ENABLE_XAPIAN) + if ( ! 
has_database ) { + return nullptr; + } + return new search_iterator::InternalData(this, internal->results.end()); +#else + return nullptr; +#endif +} + +int Search::get_matches_estimated() const { + // Ensure that the search as begin + begin(); + return estimated_matches_number; +} + +} //namespace zim diff --git a/src/search_internal.h b/src/search_internal.h new file mode 100644 index 0000000..8781463 --- /dev/null +++ b/src/search_internal.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_INTERNAL_H +#define ZIM_SEARCH_INTERNAL_H + +#include "config.h" + +#if defined(ENABLE_XAPIAN) +#include +#endif + +namespace zim { + +struct Search::InternalData { +#if defined(ENABLE_XAPIAN) + std::vector xapian_databases; + Xapian::Database database; + Xapian::MSet results; +#endif +}; + +struct search_iterator::InternalData { +#if defined(ENABLE_XAPIAN) + const Search* search; + Xapian::MSetIterator iterator; + Xapian::Document _document; + bool document_fetched; +#endif + Article _article; + bool article_fetched; + + +#if defined(ENABLE_XAPIAN) + InternalData(const Search* search, Xapian::MSetIterator iterator) : + search(search), + iterator(iterator), + document_fetched(false), + article_fetched(false) + {}; + + Xapian::Document get_document() { + if ( !document_fetched ) { + if (iterator != search->internal->results.end()) { + _document = iterator.get_document(); + } + document_fetched = true; + } + return _document; + } +#endif + + int get_databasenumber() { +#if defined(ENABLE_XAPIAN) + Xapian::docid docid = *iterator; + return (docid - 1) % search->zimfiles.size(); +#endif + return 0; + } + + Article& get_article() { +#if defined(ENABLE_XAPIAN) + if ( !article_fetched ) { + int databasenumber = get_databasenumber(); + const File* file = search->zimfiles[databasenumber]; + if ( ! 
file ) + _article = Article(); + else + _article = file->getArticleByUrl(get_document().get_data()); + article_fetched = true; + } +#endif + return _article; + } +}; + + + +}; //namespace zim + +#endif //ZIM_SEARCH_INTERNAL_H diff --git a/src/search_iterator.cpp b/src/search_iterator.cpp new file mode 100644 index 0000000..c950305 --- /dev/null +++ b/src/search_iterator.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "xapian/myhtmlparse.h" +#include +#include +#include +#include "search_internal.h" + +namespace zim { + + +search_iterator::~search_iterator() = default; +search_iterator::search_iterator(search_iterator&& it) = default; +search_iterator& search_iterator::operator=(search_iterator&& it) = default; + +search_iterator::search_iterator() : search_iterator(nullptr) +{}; + +search_iterator::search_iterator(InternalData* internal_data) + : internal(internal_data) +{} + +search_iterator::search_iterator(const search_iterator& it) + : internal(nullptr) +{ + if (it.internal) internal = std::unique_ptr(new InternalData(*it.internal)); +} + +search_iterator & search_iterator::operator=(const search_iterator& it) { + if ( ! it.internal ) internal.reset(); + else if ( ! 
internal ) internal = std::unique_ptr(new InternalData(*it.internal)); + else *internal = *it.internal; + + return *this; +} + +bool search_iterator::operator==(const search_iterator& it) const { +#if defined(ENABLE_XAPIAN) + if ( ! internal && ! it.internal) + return true; + if ( ! internal || ! it.internal) + return false; + return (internal->search == it.internal->search + && internal->iterator == it.internal->iterator); +#else + // If there is no xapian, there is no search. There is only one iterator: end. + // So all iterators are equal. + return true; +#endif +} + +bool search_iterator::operator!=(const search_iterator& it) const { + return ! (*this == it); +} + +search_iterator& search_iterator::operator++() { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return *this; + } + ++(internal->iterator); + internal->document_fetched = false; + internal->article_fetched = false; +#endif + return *this; +} + +search_iterator search_iterator::operator++(int) { + search_iterator it = *this; + operator++(); + return it; +} + +search_iterator& search_iterator::operator--() { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return *this; + } + --(internal->iterator); + internal->document_fetched = false; + internal->article_fetched = false; +#endif + return *this; +} + +search_iterator search_iterator::operator--(int) { + search_iterator it = *this; + operator--(); + return it; +} + +std::string search_iterator::get_url() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + return internal->get_document().get_data(); +#else + return ""; +#endif +} + +std::string search_iterator::get_title() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. 
Guess and try */ + return internal->get_document().get_value(0); + } + else if ( internal->search->valuesmap.find("title") != internal->search->valuesmap.end() ) + { + return internal->get_document().get_value(internal->search->valuesmap["title"]); + } +#endif + return ""; +} + +int search_iterator::get_score() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return 0; + } + return internal->iterator.get_percent(); +#else + return 0; +#endif +} + +std::string search_iterator::get_snippet() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return ""; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = internal->get_document().get_value(1); + if ( ! stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can genenate one */ + } + else if ( internal->search->valuesmap.find("snippet") != internal->search->valuesmap.end() ) + { + return internal->get_document().get_value(internal->search->valuesmap["snippet"]); + } + /* No reader, no snippet */ + Article& article = internal->get_article(); + if ( ! article.good() ) + return ""; + /* Get the content of the article to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + zim::MyHtmlParser htmlParser; + std::string content = article.getData(); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return internal->search->internal->results.snippet(htmlParser.dump, 500); +#else + return ""; +#endif +} + +int search_iterator::get_size() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return -1; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(2).empty() == true ? 
-1 : atoi(internal->get_document().get_value(2).c_str()); + } + else if ( internal->search->valuesmap.find("size") != internal->search->valuesmap.end() ) + { + return atoi(internal->get_document().get_value(internal->search->valuesmap["size"]).c_str()); + } +#endif + /* The size is never used. Do we really want to get the content and + calculate the size ? */ + return -1; +} + +int search_iterator::get_wordCount() const { +#if defined(ENABLE_XAPIAN) + if ( ! internal ) { + return -1; + } + if ( internal->search->valuesmap.empty() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str()); + } + else if ( internal->search->valuesmap.find("wordcount") != internal->search->valuesmap.end() ) + { + return atoi(internal->get_document().get_value(internal->search->valuesmap["wordcount"]).c_str()); + } +#endif + return -1; +} + +int search_iterator::get_fileIndex() const { +#if defined(ENABLE_XAPIAN) + if ( internal ) { + return internal->get_databasenumber(); + } +#endif + return 0; +} + +search_iterator::reference search_iterator::operator*() const { + return internal->get_article(); +} + +search_iterator::pointer search_iterator::operator->() const { + return &internal->get_article(); +} + +} // namespace zim diff --git a/src/template.cpp b/src/template.cpp new file mode 100644 index 0000000..75e4bb8 --- /dev/null +++ b/src/template.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "template.h" + +namespace zim +{ + void TemplateParser::state_data(char ch) + { + data += ch; + + if (ch == '<') + { + state = &TemplateParser::state_lt; + save = data.size() - 1; + } + } + + void TemplateParser::state_lt(char ch) + { + data += ch; + + if (ch == '%') + state = &TemplateParser::state_token0; + else + state = &TemplateParser::state_data; + } + + void TemplateParser::state_token0(char ch) + { + data += ch; + + if (ch == '/') + state = &TemplateParser::state_link0; + else + { + token = data.size() - 1; + state = &TemplateParser::state_token; + } + } + + void TemplateParser::state_token(char ch) + { + data += ch; + + if (ch == '%') + state = &TemplateParser::state_token_end; + } + + void TemplateParser::state_token_end(char ch) + { + if (ch == '>') + { + if (event) + { + event->onData(data.substr(0, save)); + event->onToken(data.substr(token, data.size() - token - 1)); + data.clear(); + } + + state = &TemplateParser::state_data; + } + else + { + data += ch; + state = &TemplateParser::state_data; + } + } + + void TemplateParser::state_link0(char ch) + { + data += ch; + + ns = ch; + state = &TemplateParser::state_link; + } + + void TemplateParser::state_link(char ch) + { + data += ch; + + if (ch == '/') + { + token = data.size(); + state = &TemplateParser::state_title; + } + else + state = &TemplateParser::state_data; + } + + void TemplateParser::state_title(char ch) + { + data += ch; + + if (ch == '%') + { + token_e = data.size() - 1; + 
state = &TemplateParser::state_title_end; + } + } + + void TemplateParser::state_title_end(char ch) + { + data += ch; + + if (ch == '>') + { + if (event) + { + event->onData(data.substr(0, save)); + event->onLink(ns, data.substr(token, token_e - token)); + } + + data.clear(); + state = &TemplateParser::state_data; + } + } + + void TemplateParser::flush() + { + if (event) + event->onData(data); + data.clear(); + state = &TemplateParser::state_data; + } +} diff --git a/src/template.h b/src/template.h new file mode 100644 index 0000000..116be10 --- /dev/null +++ b/src/template.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
namespace zim
{
  /// Incremental parser for templates containing "<%token%>" and
  /// "<%/N/title%>" directives. Characters are fed one at a time (or as a
  /// string) and the recognised pieces are reported to an Event listener.
  class TemplateParser
  {
    public:
      /// Callback interface receiving the parsed pieces.
      class Event
      {
        public:
          /// Plain text found between directives.
          virtual void onData(const std::string& data) = 0;
          /// Name found in a "<%token%>" directive.
          virtual void onToken(const std::string& token) = 0;
          /// Namespace character and url found in a "<%/N/url%>" directive.
          virtual void onLink(char ns, const std::string& url) = 0;
          virtual ~Event() = default;
      };

    private:
      // Listener; the parser never deletes it and tolerates a null pointer.
      Event* event;

      // Text buffered since the last reported piece.
      std::string data;
      // Indexes into `data`. They are assigned by the state functions before
      // being read, but are zero-initialised so the object never carries
      // indeterminate values.
      std::string::size_type save = 0;     // position of the opening '<'
      std::string::size_type token = 0;    // first character of the token/title
      std::string::size_type token_e = 0;  // closing '%' of a link title
      char ns = '\0';                      // namespace of the link being parsed
      typedef void (TemplateParser::*state_type)(char);

      // Handler for the next character.
      state_type state;

      void state_data(char ch);
      void state_lt(char ch);
      void state_token0(char ch);
      void state_token(char ch);
      void state_token_end(char ch);
      void state_link0(char ch);
      void state_link(char ch);
      void state_title(char ch);
      void state_title_end(char ch);

    public:
      /// @param ev listener notified of parsed pieces (may be null).
      explicit TemplateParser(Event* ev)
        : event(ev),
          state(&TemplateParser::state_data)
        { }

      /// Feeds a single character to the state machine.
      void parse(char ch)
      {
        (this->*state)(ch);
      }

      /// Feeds every character of `s`, in order.
      void parse(const std::string& s)
      {
        for (const char ch : s)
          parse(ch);
      }

      /// Reports the remaining buffered text and resets the parser.
      void flush();
  };
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "tools.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#ifdef _WIN32 +# include +# include +# include +# include +# define SEPARATOR "\\" +#else +# include +# define SEPARATOR "/" +#endif + +#ifdef __MINGW32__ +# include +#else +# include +# include +#endif + + +std::string zim::removeAccents(const std::string& text) +{ + ucnv_setDefaultName("UTF-8"); + static UErrorCode status = U_ZERO_ERROR; + static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( + "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); + icu::UnicodeString ustring(text.c_str()); + removeAccentsTrans->transliterate(ustring); + std::string unaccentedText; + ustring.toUTF8String(unaccentedText); + return unaccentedText; +} + + +void zim::microsleep(int microseconds) { +#ifdef __MINGW32__ + struct timespec wait = {0, 0}; + wait.tv_sec = microseconds / 1000000; + wait.tv_nsec = (microseconds - wait.tv_sec*10000) * 1000; + nanosleep(&wait, nullptr); +#else + std::this_thread::sleep_for(std::chrono::microseconds(microseconds)); +#endif +} diff --git a/src/tools.h b/src/tools.h new file mode 100644 index 0000000..1a58e7b --- /dev/null +++ b/src/tools.h @@ -0,0 +1,32 @@ +/* + * Copyright 2013-2016 Emmanuel Engelhart + * Copyright 2016 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software 
namespace zim {

  // Returns a lower-cased copy of `text` with mark characters removed
  // (ICU transliteration "Lower; NFD; [:M:] remove; NFC").
  std::string removeAccents(const std::string& text);
  // Suspends the calling thread for the given number of microseconds.
  void microsleep(int microseconds);
}
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include // necessary to have the new types +#include "log.h" +#include "md5.h" + +#ifdef _WIN32 + +# include +# include +int gettimeofday(struct timeval* tp, void* tzp) { + DWORD t; + t = timeGetTime(); + tp->tv_sec = t / 1000; + tp->tv_usec = t % 1000; + return 0; +} + +#define getpid GetCurrentProcessId + +#else +# include +#endif + +log_define("zim.uuid") + +namespace zim +{ + namespace + { + char hex[] = "0123456789abcdef"; + inline char hi(char v) + { return hex[(v >> 4) & 0xf]; } + + inline char lo(char v) + { return hex[v & 0xf]; } + } + + Uuid Uuid::generate(std::string value) + { + Uuid ret; + struct zim_MD5_CTX md5ctx; + zim_MD5Init(&md5ctx); + + if ( value.empty() ) { + struct timeval tv; + gettimeofday(&tv, 0); + + clock_t c = clock(); + + zim_MD5Update(&md5ctx, reinterpret_cast(&c), sizeof(clock_t)); + zim_MD5Update(&md5ctx, reinterpret_cast(&tv), sizeof(struct timeval)); + } else { + zim_MD5Update(&md5ctx, reinterpret_cast(value.data()), value.size()); + } + zim_MD5Final(reinterpret_cast(&ret.data[0]), &md5ctx); + + log_debug("generated uuid: " << ret.data); + + return ret; + } + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid) + { + for (unsigned n = 0; n < 4; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 4; n < 6; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 6; n < 8; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 8; n < 10; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 10; n < 16; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + return out; + } + +} diff --git a/src/writer/_dirent.h b/src/writer/_dirent.h new file 
mode 100644 index 0000000..e7338ee --- /dev/null +++ b/src/writer/_dirent.h @@ -0,0 +1,179 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DIRENT_H +#define ZIM_WRITER_DIRENT_H + +#include "cluster.h" + +#include "debug.h" + +namespace zim +{ + namespace writer { + class Dirent; + struct DirectInfo { + DirectInfo() : + clusterNumber(0), + blobNumber(0) + {}; + cluster_index_t clusterNumber; + blob_index_t blobNumber; + }; + + struct RedirectInfo { + const Dirent* redirectDirent = nullptr; + }; + + union DirentInfo { + DirectInfo d; + RedirectInfo r; + }; + + class Dirent + { + static const uint16_t redirectMimeType = 0xffff; + static const uint16_t linktargetMimeType = 0xfffe; + static const uint16_t deletedMimeType = 0xfffd; + static const uint32_t version = 0; + + uint16_t mimeType; + DirentInfo info {}; + Url url; + std::string title; + Cluster* cluster = nullptr; + Url redirectUrl; + article_index_t idx = article_index_t(0); + offset_t offset; + + public: + Dirent() + : mimeType(0), + url(), + title(), + redirectUrl() + { + info.d.clusterNumber = cluster_index_t(0); + info.d.blobNumber = blob_index_t(0); + } + + explicit Dirent(Url url_ ) + : Dirent() + { url = url_; } + + char 
getNamespace() const { return url.getNs(); } + const std::string& getTitle() const { return title.empty() ? url.getUrl() : title; } + void setTitle(const std::string& title_) { title = title_; } + const std::string& getUrl() const { return url.getUrl(); } + const Url& getFullUrl() const { return url; } + void setUrl(Url url_) { + url = url_; + } + + uint32_t getVersion() const { return version; } + + void setRedirectUrl(Url redirectUrl_) { redirectUrl = redirectUrl_; } + const Url& getRedirectUrl() const { return redirectUrl; } + void setRedirect(const Dirent* target) { + info.r.redirectDirent = target; + mimeType = redirectMimeType; + } + article_index_t getRedirectIndex() const { return isRedirect() ? info.r.redirectDirent->getIdx() : article_index_t(0); } + + void setMimeType(uint16_t mime) + { + mimeType = mime; + } + + void setLinktarget() + { + ASSERT(mimeType, ==, 0); + mimeType = linktargetMimeType; + } + + void setDeleted() + { + ASSERT(mimeType, ==, 0); + mimeType = deletedMimeType; + } + + + void setIdx(article_index_t idx_) { idx = idx_; } + article_index_t getIdx() const { return idx; } + + + void setCluster(zim::writer::Cluster* _cluster) + { + ASSERT(isArticle(), ==, true); + cluster = _cluster; + info.d.blobNumber = _cluster->count(); + } + + cluster_index_t getClusterNumber() const { + return cluster ? cluster->getClusterIndex() : info.d.clusterNumber; + } + blob_index_t getBlobNumber() const { + return isRedirect() ? blob_index_t(0) : info.d.blobNumber; + } + + bool isRedirect() const { return mimeType == redirectMimeType; } + bool isLinktarget() const { return mimeType == linktargetMimeType; } + bool isDeleted() const { return mimeType == deletedMimeType; } + bool isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } + uint16_t getMimeType() const { return mimeType; } + size_t getDirentSize() const + { + size_t ret = (isRedirect() ? 
12 : 16) + url.getUrl().size() + 2; + if (title != url.getUrl()) + ret += title.size(); + return ret; + } + + offset_t getOffset() const { return offset; } + void setOffset(offset_t o) { offset = o; } + + void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) + { + ASSERT(mimeType, ==, 0); + mimeType = mimeType_; + info.d.clusterNumber = clusterNumber_; + info.d.blobNumber = blobNumber_; + } + + void write(int out_fd) const; + + friend bool compareUrl(const Dirent* d1, const Dirent* d2); + friend inline bool compareTitle(const Dirent* d1, const Dirent* d2); + }; + + + inline bool compareUrl(const Dirent* d1, const Dirent* d2) + { + return d1->url < d2->url; + } + inline bool compareTitle(const Dirent* d1, const Dirent* d2) + { + return d1->url.getNs() < d2->url.getNs() + || (d1->url.getNs() == d2->url.getNs() && d1->getTitle() < d2->getTitle()); + } + } +} + +#endif // ZIM_WRITER_DIRENT_H + diff --git a/src/writer/article.cpp b/src/writer/article.cpp new file mode 100644 index 0000000..bb62f34 --- /dev/null +++ b/src/writer/article.cpp @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
namespace zim
{
  namespace writer
  {
    // Default answer: an article is not a link target.
    // NOTE(review): presumably meant to be overridden by subclasses - the
    // class declaration is not visible here.
    bool Article::isLinktarget() const
    {
      return false;
    }

    // Default answer: an article is not a deleted entry.
    bool Article::isDeleted() const
    {
      return false;
    }

    // Default answer: an empty string.
    std::string Article::getNextCategory()
    {
      return std::string();
    }

  }
}
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include "../log.h" +#include "../endian_tools.h" +#include "../debug.h" +#include "../compression.h" + +#include +#include + +#include +#include + +#ifdef _WIN32 +# include +#else +# include +# define _write(fd, addr, size) ::write((fd), (addr), (size)) +#endif + +namespace zim { +namespace writer { + +Cluster::Cluster(CompressionType compression) + : compression(compression), + isExtended(false), + _size(0) +{ + blobOffsets.push_back(offset_t(0)); + pthread_mutex_init(&m_closedMutex,NULL); +} + +Cluster::~Cluster() { + pthread_mutex_destroy(&m_closedMutex); + if (compressed_data.data()) { + delete[] compressed_data.data(); + } +} + +void Cluster::clear_data() { + clear_raw_data(); + clear_compressed_data(); +} + +void Cluster::clear_raw_data() { + Offsets().swap(blobOffsets); + ClusterData().swap(_data); +} + +void Cluster::clear_compressed_data() { + if (compressed_data.data()) { + delete[] compressed_data.data(); + compressed_data = Blob(); + } +} + +void Cluster::close() { + if (getCompression() != zim::zimcompDefault + && getCompression() != zim::zimcompNone) { + + // We must compress the content in a buffer. 
+ compress(); + clear_raw_data(); + } + pthread_mutex_lock(&m_closedMutex); + closed = true; + pthread_mutex_unlock(&m_closedMutex); +} + +bool Cluster::isClosed() const{ + bool v; + pthread_mutex_lock(&m_closedMutex); + v = closed; + pthread_mutex_unlock(&m_closedMutex); + return v; +} + +zsize_t Cluster::size() const +{ + if (isClosed()) { + throw std::runtime_error("oups"); + } + if (isExtended) { + return zsize_t(blobOffsets.size() * sizeof(uint64_t)) + _size; + } else { + return zsize_t(blobOffsets.size() * sizeof(uint32_t)) + _size; + } +} + +template +void Cluster::write_offsets(writer_t writer) const +{ + size_type delta = blobOffsets.size() * sizeof(OFFSET_TYPE); + char out_buf[sizeof(OFFSET_TYPE)]; + for (auto offset : blobOffsets) + { + offset.v += delta; + toLittleEndian(static_cast(offset.v), out_buf); + writer(Blob(out_buf, sizeof(OFFSET_TYPE))); + } +} + +void Cluster::write_content(writer_t writer) const +{ + if (isExtended) { + write_offsets(writer); + } else { + write_offsets(writer); + } + write_data(writer); +} + +void Cluster::compress() +{ + auto comp = getCompression(); + switch(comp) { + case zim::zimcompBzip2: +#if !defined(ENABLE_ZLIB) + case zim::zimcompZip: +#endif +#if !defined(ENABLE_ZSTD) + case zim::zimcompZstd: +#endif + { + throw std::runtime_error("Compression method not enabled in this library"); + break; + } + + case zim::zimcompLzma: + { + _compress(); + break; + } + +#if defined(ENABLE_ZLIB) + case zim::zimcompZip: + { + _compress(); + break; + } +#endif + +#if defined(ENABLE_ZSTD) + case zim::zimcompZstd: + { + _compress(); + break; + } +#endif + + default: + throw std::runtime_error("We cannot compress an uncompressed cluster"); + }; +} + +template +void Cluster::_compress() +{ + Compressor runner; + bool first = true; + auto writer = [&](const Blob& data) -> void { + if (first) { + runner.init((char*)data.data()); + first = false; + } + runner.feed(data.data(), data.size()); + }; + write_content(writer); + zsize_t size; + 
auto comp = runner.get_data(&size); + compressed_data = Blob(comp.release(), size.v); +} + +void Cluster::write(int out_fd) const +{ + // write clusterInfo + char clusterInfo = 0; + if (isExtended) { + clusterInfo = 0x10; + } + clusterInfo += getCompression(); + if (_write(out_fd, &clusterInfo, 1) == -1) { + throw std::runtime_error("Error writng"); + } + + // Open a comprestion stream if needed + switch(getCompression()) + { + case zim::zimcompDefault: + case zim::zimcompNone: + { + auto writer = [=](const Blob& data) -> void { + // Ideally we would simply have to do : + // ::write(tmp_fd, data.c_str(), data.size()); + // However, the data can be pretty big (> 4Gb), especially with test, + // And ::write fails to write data > 4Gb. So we have to chunck the write. + size_type to_write = data.size(); + const char* src = data.data(); + while (to_write) { + size_type chunk_size = to_write > 4096 ? 4096 : to_write; + auto ret = _write(out_fd, src, chunk_size); + src += ret; + to_write -= ret; + } + }; + write_content(writer); + break; + } + + case zim::zimcompZip: + case zim::zimcompBzip2: + case zim::zimcompLzma: + case zim::zimcompZstd: + { + log_debug("compress data"); + if (_write(out_fd, compressed_data.data(), compressed_data.size()) == -1) { + throw std::runtime_error("Error writing"); + } + break; + } + + default: + std::ostringstream msg; + msg << "invalid compression flag " << getCompression(); + log_error(msg.str()); + throw std::runtime_error(msg.str()); + } +} + +void Cluster::addArticle(const zim::writer::Article* article) +{ + auto filename = article->getFilename(); + auto size = article->getSize(); + _size += size; + blobOffsets.push_back(offset_t(_size.v)); + isExtended |= (size>UINT32_MAX); + if (size == 0) + return; + + if (filename.empty()) { + _data.emplace_back(DataType::plain, article->getData()); + } + else { + _data.emplace_back(DataType::file, filename); + } +} + +void Cluster::addData(const char* data, zsize_t size) +{ + _size += size; + 
blobOffsets.push_back(offset_t(_size.v)); + isExtended |= (size.v>UINT32_MAX); + if (size.v == 0) + return; + + _data.emplace_back(DataType::plain, data, size.v); +} + +void Cluster::write_data(writer_t writer) const +{ + for (auto& data: _data) + { + ASSERT(data.value.empty(), ==, false); + if (data.type == DataType::plain) { + writer(Blob(data.value.c_str(), data.value.size())); + } else { + int fd = open(data.value.c_str(), O_RDONLY); + if (fd == -1) { + throw std::runtime_error(std::string("cannot open ") + data.value); + } + char* buffer = new char[1024*1024]; + while (true) { + auto r = read(fd, buffer, 1024*1024); + if (!r) + break; + writer(Blob(buffer, r)); + } + delete [] buffer; + ::close(fd); + } + } +} + +} // writer +} // zim diff --git a/src/writer/cluster.h b/src/writer/cluster.h new file mode 100644 index 0000000..9808e30 --- /dev/null +++ b/src/writer/cluster.h @@ -0,0 +1,111 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CLUSTER_H_ +#define ZIM_WRITER_CLUSTER_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include "../zim_types.h" + +namespace zim { + +namespace writer { + +enum class DataType { plain, file }; +struct Data { + Data(zim::writer::DataType type, const std::string& value) : + type(type), value(value) {} + Data(zim::writer::DataType type, const char* data, zim::size_type size) : + type(type), value(data, size) {} + DataType type; + std::string value; +}; + +using writer_t = std::function; + +class Cluster { + typedef std::vector Offsets; + typedef std::vector ClusterData; + + + public: + Cluster(CompressionType compression); + virtual ~Cluster(); + + void setCompression(CompressionType c) { compression = c; } + CompressionType getCompression() const { return compression; } + + void addArticle(const zim::writer::Article* article); + void addData(const char* data, zsize_t size); + + blob_index_t count() const { return blob_index_t(blobOffsets.size() - 1); } + zsize_t size() const; + offset_t getOffset() const { return offset; } + void setOffset(offset_t o) { offset = o; } + bool is_extended() const { return isExtended; } + void clear_data(); + void close(); + bool isClosed() const; + + void setClusterIndex(cluster_index_t idx) { index = idx; } + cluster_index_t getClusterIndex() const { return index; } + + zsize_t getBlobSize(blob_index_t n) const + { return zsize_t(blobOffsets[blob_index_type(n)+1].v - blobOffsets[blob_index_type(n)].v); } + + void write(int out_fd) const; + + protected: + CompressionType compression; + cluster_index_t index; + bool isExtended; + Offsets blobOffsets; + offset_t offset; + zsize_t _size; + ClusterData _data; + mutable Blob compressed_data; + std::string tmp_filename; 
+ mutable pthread_mutex_t m_closedMutex; + bool closed = false; + + private: + void write_content(writer_t writer) const; + template + void write_offsets(writer_t writer) const; + void write_data(writer_t writer) const; + void compress(); + template + void _compress(); + void clear_raw_data(); + void clear_compressed_data(); +}; + +}; + +}; + + +#endif //ZIM_WRITER_CLUSTER_H_ diff --git a/src/writer/creator.cpp b/src/writer/creator.cpp new file mode 100644 index 0000000..237ad5d --- /dev/null +++ b/src/writer/creator.cpp @@ -0,0 +1,641 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#include "creatordata.h" +#include "cluster.h" +#include "debug.h" +#include "workers.h" +#include +#include +#include "../endian_tools.h" +#include +#include +#include "../md5.h" + +#if defined(ENABLE_XAPIAN) + #include "xapianIndexer.h" +#endif + +#ifdef _WIN32 +# include +#else +# include +# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ +{throw std::runtime_error("Error writing");} +#endif + +#include +#include +#include +#include +#include +#include +#include +#include "log.h" +#include "../fs.h" +#include "../tools.h" + +log_define("zim.writer.creator") + +#define INFO(e) \ + do { \ + log_info(e); \ + std::cout << e << std::endl; \ + } while(false) + +#define TINFO(e) \ + if (verbose) { \ + double seconds = difftime(time(NULL), data->start_time); \ + std::cout << "T:" << (int)(seconds) \ + << "; " << e << std::endl; \ + } + +#define CLUSTER_BASE_OFFSET 1024 + +namespace zim +{ + namespace writer + { + Creator::Creator(bool verbose, CompressionType c) + : verbose(verbose) + , compression(c) + {} + + Creator::~Creator() = default; + + void Creator::startZimCreation(const std::string& fname) + { + data = std::unique_ptr(new CreatorData(fname, verbose, withIndex, indexingLanguage, compression)); + data->setMinChunkSize(minChunkSize); + + for(unsigned i=0; idata.get()); + data->workerThreads.push_back(thread); + } + + pthread_create(&data->writerThread, NULL, clusterWriter, this->data.get()); + } + + void Creator::addArticle(std::shared_ptr
article) + { + auto dirent = data->createDirentFromArticle(article.get()); + data->addDirent(dirent, article.get()); + data->nbArticles++; + if (article->isRedirect()) { + data->nbRedirectArticles++; + } else { + if (article->shouldCompress()) + data->nbCompArticles++; + else + data->nbUnCompArticles++; + if (!article->getFilename().empty()) + data->nbFileArticles++; + if (article->shouldIndex()) + data->nbIndexArticles++; + } + if (verbose && data->nbArticles%1000 == 0){ + double seconds = difftime(time(NULL),data->start_time); + std::cout << "T:" << (int)seconds + << "; A:" << data->nbArticles + << "; RA:" << data->nbRedirectArticles + << "; CA:" << data->nbCompArticles + << "; UA:" << data->nbUnCompArticles + << "; FA:" << data->nbFileArticles + << "; IA:" << data->nbIndexArticles + << "; C:" << data->nbClusters + << "; CC:" << data->nbCompClusters + << "; UC:" << data->nbUnCompClusters + << "; WC:" << data->taskList.size() + << std::endl; + } + +#if defined(ENABLE_XAPIAN) + if (article->shouldIndex()) { + data->titleIndexer.index(article.get()); + if(withIndex && !article->isRedirect()) { + data->taskList.pushToQueue(new IndexTask(article)); + } + } +#endif + } + + void Creator::finishZimCreation() + { + if (verbose) { + double seconds = difftime(time(NULL),data->start_time); + std::cout << "T:" << (int)seconds + << "; A:" << data->nbArticles + << "; RA:" << data->nbRedirectArticles + << "; CA:" << data->nbCompArticles + << "; UA:" << data->nbUnCompArticles + << "; FA:" << data->nbFileArticles + << "; IA:" << data->nbIndexArticles + << "; C:" << data->nbClusters + << "; CC:" << data->nbCompClusters + << "; UC:" << data->nbUnCompClusters + << "; WC:" << data->taskList.size() + << std::endl; + } + + // We need to wait that all indexation task has been done before closing the + // xapian database and add it to zim. 
+ unsigned int wait = 0; + do { + microsleep(wait); + wait += 10; + } while(IndexTask::waiting_task.load() > 0); + +#if defined(ENABLE_XAPIAN) + { + data->titleIndexer.indexingPostlude(); + auto article = data->titleIndexer.getMetaArticle(); + auto dirent = data->createDirentFromArticle(article); + data->addDirent(dirent, article); + delete article; + } + if (withIndex) { + wait = 0; + do { + microsleep(wait); + wait += 10; + } while(IndexTask::waiting_task.load() > 0); + + data->indexer->indexingPostlude(); + microsleep(100); + auto article = data->indexer->getMetaArticle(); + auto dirent = data->createDirentFromArticle(article); + data->addDirent(dirent, article); + delete article; + } +#endif + + // When we've seen all articles, write any remaining clusters. + if (data->compCluster->count()) + data->closeCluster(true); + + if (data->uncompCluster->count()) + data->closeCluster(false); + + TINFO("Waiting for workers"); + // wait all cluster compression has been done + wait = 0; + do { + microsleep(wait); + wait += 10; + } while(ClusterTask::waiting_task.load() > 0); + + // Quit all workerThreads + for (auto i=0U; i< nbWorkerThreads; i++) { + data->taskList.pushToQueue(nullptr); + } + for(auto& thread: data->workerThreads) { + pthread_join(thread, nullptr); + } + + // Wait for writerThread to finish. 
+ data->clusterToWrite.pushToQueue(nullptr); + pthread_join(data->writerThread, nullptr); + + TINFO("ResolveRedirectIndexes"); + data->resolveRedirectIndexes(); + + TINFO("Set article indexes"); + data->setArticleIndexes(); + + TINFO("Resolve mimetype"); + data->resolveMimeTypes(); + + TINFO("create title index"); + data->createTitleIndex(); + TINFO(data->dirents.size() << " title index created"); + TINFO(data->clustersList.size() << " clusters created"); + + TINFO("write zimfile :"); + write(); + ::close(data->out_fd); + + TINFO("rename tmpfile to final one."); + DEFAULTFS::rename(data->basename+".zim.tmp", data->basename+".zim"); + + TINFO("finish"); + } + + void Creator::fillHeader(Fileheader* header) const + { + auto mainUrl = getMainUrl(); + auto layoutUrl = getLayoutUrl(); + + if (data->isExtended) { + header->setMajorVersion(Fileheader::zimExtendedMajorVersion); + } else { + header->setMajorVersion(Fileheader::zimClassicMajorVersion); + } + header->setMinorVersion(Fileheader::zimMinorVersion); + header->setMainPage(std::numeric_limits::max()); + header->setLayoutPage(std::numeric_limits::max()); + + if (!mainUrl.empty() || !layoutUrl.empty()) + { + for (auto& dirent: data->dirents) + { + if (mainUrl == dirent->getFullUrl()) + { + header->setMainPage(article_index_type(dirent->getIdx())); + } + + if (layoutUrl == dirent->getFullUrl()) + { + header->setLayoutPage(article_index_type(dirent->getIdx())); + } + } + } + + header->setUuid( getUuid() ); + header->setArticleCount( data->dirents.size() ); + + header->setMimeListPos( Fileheader::size ); + + header->setClusterCount( data->clustersList.size() ); + } + + void Creator::write() const + { + Fileheader header; + fillHeader(&header); + + int out_fd = data->out_fd; + + lseek(out_fd, header.getMimeListPos(), SEEK_SET); + TINFO(" write mimetype list"); + for(auto& mimeType: data->mimeTypesList) + { + _write(out_fd, mimeType.c_str(), mimeType.size()+1); + } + + _write(out_fd, "", 1); + + ASSERT(lseek(out_fd, 0, 
SEEK_CUR), <, CLUSTER_BASE_OFFSET); + + TINFO(" write directory entries"); + lseek(out_fd, 0, SEEK_END); + for (Dirent* dirent: data->dirents) + { + dirent->setOffset(offset_t(lseek(out_fd, 0, SEEK_CUR))); + dirent->write(out_fd); + } + + TINFO(" write url prt list"); + header.setUrlPtrPos(lseek(out_fd, 0, SEEK_CUR)); + for (auto& dirent: data->dirents) + { + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(dirent->getOffset(), tmp_buff); + _write(out_fd, tmp_buff, sizeof(offset_type)); + } + + TINFO(" write title index"); + header.setTitleIdxPos(lseek(out_fd, 0, SEEK_CUR)); + for (Dirent* dirent: data->titleIdx) + { + char tmp_buff[sizeof(article_index_type)]; + toLittleEndian(dirent->getIdx().v, tmp_buff); + _write(out_fd, tmp_buff, sizeof(article_index_type)); + } + + TINFO(" write cluster offset list"); + header.setClusterPtrPos(lseek(out_fd, 0, SEEK_CUR)); + for (auto cluster : data->clustersList) + { + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(cluster->getOffset(), tmp_buff); + _write(out_fd, tmp_buff, sizeof(offset_type)); + } + + header.setChecksumPos(lseek(out_fd, 0, SEEK_CUR)); + + TINFO(" write header"); + lseek(out_fd, 0, SEEK_SET); + header.write(out_fd); + + TINFO(" write checksum"); + struct zim_MD5_CTX md5ctx; + unsigned char batch_read[1024+1]; + lseek(out_fd, 0, SEEK_SET); + zim_MD5Init(&md5ctx); + while (true) { + auto r = read(out_fd, batch_read, 1024); + if (r == -1) { + perror("Cannot read"); + throw std::runtime_error("oups"); + } + if (r == 0) + break; + batch_read[r] = 0; + zim_MD5Update(&md5ctx, batch_read, r); + } + unsigned char digest[16]; + zim_MD5Final(digest, &md5ctx); + _write(out_fd, reinterpret_cast(digest), 16); + } + + CreatorData::CreatorData(const std::string& fname, + bool verbose, + bool withIndex, + std::string language, + CompressionType c) + : compression(c), + withIndex(withIndex), + indexingLanguage(language), +#if defined(ENABLE_XAPIAN) + titleIndexer(language, IndexingMode::TITLE, true), +#endif + 
verbose(verbose), + nbArticles(0), + nbRedirectArticles(0), + nbCompArticles(0), + nbUnCompArticles(0), + nbFileArticles(0), + nbIndexArticles(0), + nbClusters(0), + nbCompClusters(0), + nbUnCompClusters(0), + start_time(time(NULL)) + { + basename = (fname.size() > 4 && fname.compare(fname.size() - 4, 4, ".zim") == 0) + ? fname.substr(0, fname.size() - 4) + : fname; + auto zim_name = basename + ".zim.tmp"; +#ifdef _WIN32 +int mode = _S_IREAD | _S_IWRITE; +#else + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; +#endif + out_fd = open(zim_name.c_str(), O_RDWR|O_CREAT|O_TRUNC, mode); + if (out_fd == -1){ + perror(nullptr); + std::ostringstream ss; + ss << "Cannot create file " << zim_name; + throw std::runtime_error(ss.str()); + } + if(lseek(out_fd, CLUSTER_BASE_OFFSET, SEEK_SET) != CLUSTER_BASE_OFFSET) { + close(out_fd); + perror(nullptr); + throw std::runtime_error("Impossible to seek in file"); + } + + // We keep both a "compressed cluster" and an "uncompressed cluster" + // because we don't know which one will fill up first. We also need + // to track the dirents currently in each, so we can fix up the + // cluster index if the other one ends up written first. 
+ compCluster = new Cluster(compression); + uncompCluster = new Cluster(zimcompNone); + +#if defined(ENABLE_XAPIAN) + titleIndexer.indexingPrelude(basename+"_title.idx"); + if (withIndex) { + indexer = new XapianIndexer(indexingLanguage, IndexingMode::FULL, true); + indexer->indexingPrelude(basename+".idx"); + } +#endif + } + + CreatorData::~CreatorData() + { + if (compCluster) + delete compCluster; + if (uncompCluster) + delete uncompCluster; + for(auto& cluster: clustersList) { + delete cluster; + } +#if defined(ENABLE_XAPIAN) + if (indexer) + delete indexer; +#endif + } + + void CreatorData::addDirent(Dirent* dirent, const Article* article) + { + auto ret = dirents.insert(dirent); + if (!ret.second) { + Dirent* existing = *ret.first; + if (existing->isRedirect() && !dirent->isRedirect()) { + unresolvedRedirectDirents.erase(existing); + dirents.erase(ret.first); + dirents.insert(dirent); + } else { + std::cerr << "Impossible to add " << dirent->getFullUrl().getLongUrl() << std::endl; + std::cerr << " dirent's title to add is : " << dirent->getTitle() << std::endl; + std::cerr << " existing dirent's title is : " << existing->getTitle() << std::endl; + return; + } + }; + + // If this is a redirect, we're done: there's no blob to add. + if (dirent->isRedirect()) + { + unresolvedRedirectDirents.insert(dirent); + return; + } + + // Add blob data to compressed or uncompressed cluster. + auto articleSize = article->getSize(); + if (articleSize > 0) + { + isEmpty = false; + } + + Cluster *cluster; + if (article->shouldCompress()) + { + cluster = compCluster; + } + else + { + cluster = uncompCluster; + } + + // If cluster will be too large, write it to dis, and open a new + // one for the content. 
+ if ( cluster->count() + && cluster->size().v+articleSize >= minChunkSize * 1024 + ) + { + log_info("cluster with " << cluster->count() << " articles, " << + cluster->size() << " bytes; current title \"" << + dirent->getTitle() << '\"'); + cluster = closeCluster(article->shouldCompress()); + } + + dirent->setCluster(cluster); + cluster->addArticle(article); + } + + Dirent* CreatorData::createDirentFromArticle(const Article* article) + { + auto dirent = pool.getDirent(); + dirent->setUrl(article->getUrl()); + dirent->setTitle(article->getTitle()); + + if (article->isRedirect()) + { + dirent->setRedirect(nullptr); + dirent->setRedirectUrl(article->getRedirectUrl()); + } + else if (article->isLinktarget()) + { + dirent->setLinktarget(); + } + else if (article->isDeleted()) + { + dirent->setDeleted(); + } + else + { + auto mimetype = article->getMimeType(); + if (mimetype.empty()) { + std::cerr << "Warning, " << article->getUrl().getLongUrl() << " have empty mimetype." << std::endl; + mimetype = "application/octet-stream"; + } + dirent->setMimeType(getMimeTypeIdx(mimetype)); + } + return dirent; + } + + Cluster* CreatorData::closeCluster(bool compressed) + { + Cluster *cluster; + nbClusters++; + if (compressed ) + { + cluster = compCluster; + nbCompClusters++; + } else { + cluster = uncompCluster; + nbUnCompClusters++; + } + cluster->setClusterIndex(cluster_index_t(clustersList.size())); + clustersList.push_back(cluster); + taskList.pushToQueue(new ClusterTask(cluster)); + clusterToWrite.pushToQueue(cluster); + + if (cluster->is_extended() ) + isExtended = true; + if (compressed) + { + cluster = compCluster = new Cluster(compression); + } else { + cluster = uncompCluster = new Cluster(zimcompNone); + } + return cluster; + } + + void CreatorData::setArticleIndexes() + { + // set index + INFO("set index"); + article_index_t idx(0); + for (auto& dirent: dirents) { + dirent->setIdx(idx); + idx += 1; + } + } + + void CreatorData::resolveRedirectIndexes() + { + // translate 
redirect aid to index + INFO("Resolve redirect"); + for (auto dirent: unresolvedRedirectDirents) + { + Dirent tmpDirent(dirent->getRedirectUrl()); + auto target_pos = dirents.find(&tmpDirent); + if(target_pos == dirents.end()) { + INFO("Invalid redirection " << dirent->getFullUrl().getLongUrl() << " redirecting to (missing) " << dirent->getRedirectUrl().getLongUrl()); + dirents.erase(dirent); + } else { + dirent->setRedirect(*target_pos); + } + } + } + + void CreatorData::createTitleIndex() + { + titleIdx.clear(); + for (auto dirent: dirents) + titleIdx.insert(dirent); + } + + void CreatorData::resolveMimeTypes() + { + std::vector oldMImeList; + std::vector mapping; + + for (auto& rmimeType: rmimeTypesMap) + { + oldMImeList.push_back(rmimeType.second); + mimeTypesList.push_back(rmimeType.second); + } + + mapping.resize(oldMImeList.size()); + std::sort(mimeTypesList.begin(), mimeTypesList.end()); + + for (unsigned i=0; i(j); + } + } + + for (auto& dirent: dirents) + { + if (dirent->isArticle()) + dirent->setMimeType(mapping[dirent->getMimeType()]); + } + } + + uint16_t CreatorData::getMimeTypeIdx(const std::string& mimeType) + { + auto it = mimeTypesMap.find(mimeType); + if (it == mimeTypesMap.end()) + { + if (nextMimeIdx >= std::numeric_limits::max()) + throw std::runtime_error("too many distinct mime types"); + mimeTypesMap[mimeType] = nextMimeIdx; + rmimeTypesMap[nextMimeIdx] = mimeType; + return nextMimeIdx++; + } + + return it->second; + } + + const std::string& CreatorData::getMimeType(uint16_t mimeTypeIdx) const + { + auto it = rmimeTypesMap.find(mimeTypeIdx); + if (it == rmimeTypesMap.end()) + throw std::runtime_error("mime type index not found"); + return it->second; + } + } +} diff --git a/src/writer/creatordata.h b/src/writer/creatordata.h new file mode 100644 index 0000000..a1b083c --- /dev/null +++ b/src/writer/creatordata.h @@ -0,0 +1,149 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or 
+ * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CREATOR_DATA_H +#define ZIM_WRITER_CREATOR_DATA_H + +#include +#include +#include "queue.h" +#include "_dirent.h" +#include "workers.h" +#include "xapianIndexer.h" +#include +#include +#include +#include "config.h" + +#include "direntPool.h" + +#if defined(ENABLE_XAPIAN) + class XapianIndexer; +#endif + +namespace zim +{ + namespace writer + { + struct UrlCompare { + bool operator() (const Dirent* d1, const Dirent* d2) const { + return compareUrl(d1, d2); + } + }; + + struct TitleCompare { + bool operator() (const Dirent* d1, const Dirent* d2) const { + return compareTitle(d1, d2); + } + }; + + + class Cluster; + class CreatorData + { + public: + typedef std::set UrlSortedDirents; + typedef std::multiset TitleSortedDirents; + typedef std::map MimeTypesMap; + typedef std::map RMimeTypesMap; + typedef std::vector MimeTypesList; + typedef std::vector ClusterList; + typedef Queue ClusterQueue; + typedef Queue TaskQueue; + typedef std::vector ThreadList; + + CreatorData(const std::string& fname, bool verbose, + bool withIndex, std::string language, + CompressionType compression); + virtual ~CreatorData(); + + void addDirent(Dirent* dirent, const Article* article); + Dirent* createDirentFromArticle(const Article* article); + Cluster* closeCluster(bool 
compressed); + + void setArticleIndexes(); + void resolveRedirectIndexes(); + void createTitleIndex(); + void resolveMimeTypes(); + + uint16_t getMimeTypeIdx(const std::string& mimeType); + const std::string& getMimeType(uint16_t mimeTypeIdx) const; + + size_t minChunkSize = 1024-64; + + DirentPool pool; + + UrlSortedDirents dirents; + UrlSortedDirents unresolvedRedirectDirents; + TitleSortedDirents titleIdx; + + MimeTypesMap mimeTypesMap; + RMimeTypesMap rmimeTypesMap; + MimeTypesList mimeTypesList; + uint16_t nextMimeIdx = 0; + + ClusterList clustersList; + ClusterQueue clusterToWrite; + TaskQueue taskList; + ThreadList workerThreads; + pthread_t writerThread; + const CompressionType compression; + std::string basename; + bool isEmpty = true; + bool isExtended = false; + zsize_t clustersSize; + Cluster *compCluster = nullptr; + Cluster *uncompCluster = nullptr; + int out_fd; + + bool withIndex; + std::string indexingLanguage; +#if defined(ENABLE_XAPIAN) + XapianIndexer titleIndexer; + XapianIndexer* indexer = nullptr; +#endif + + // Some stats + bool verbose; + article_index_type nbArticles; + article_index_type nbRedirectArticles; + article_index_type nbCompArticles; + article_index_type nbUnCompArticles; + article_index_type nbFileArticles; + article_index_type nbIndexArticles; + cluster_index_type nbClusters; + cluster_index_type nbCompClusters; + cluster_index_type nbUnCompClusters; + time_t start_time; + + cluster_index_t clusterCount() const + { return cluster_index_t(clustersList.size()); } + + article_index_t articleCount() const + { return article_index_t(dirents.size()); } + + size_t getMinChunkSize() { return minChunkSize; } + void setMinChunkSize(size_t s) { minChunkSize = s; } + }; + + } + +} + +#endif // ZIM_WRITER_CREATOR_DATA_H diff --git a/src/writer/dirent.cpp b/src/writer/dirent.cpp new file mode 100644 index 0000000..d13485c --- /dev/null +++ b/src/writer/dirent.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2006 Tommi Maekitalo + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "_dirent.h" +#include +#include "buffer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include +#ifdef _WIN32 +# include +#else +# include +# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ +{throw std::runtime_error("Error writing");} +#endif + +log_define("zim.dirent") + +void zim::writer::Dirent::write(int out_fd) const +{ + union + { + char d[16]; + long a; + } header; + zim::toLittleEndian(getMimeType(), header.d); + header.d[2] = 0; // parameter size + header.d[3] = getNamespace(); + + log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); + + zim::toLittleEndian(getVersion(), header.d + 4); + + if (isRedirect()) + { + zim::toLittleEndian(getRedirectIndex().v, header.d + 8); + _write(out_fd, header.d, 12); + } + else if (isLinktarget() || isDeleted()) + { + _write(out_fd, header.d, 8); + } + else + { + zim::toLittleEndian(zim::cluster_index_type(getClusterNumber()), header.d + 8); + zim::toLittleEndian(zim::blob_index_type(getBlobNumber()), header.d + 12); + _write(out_fd, header.d, 16); + } + + auto& url = getUrl(); + _write(out_fd, url.c_str(), url.size()+1); + + std::string t = getTitle(); + if (t != getUrl()) + 
_write(out_fd, t.c_str(), t.size()); + char c = 0; + _write(out_fd, &c, 1); + +} diff --git a/src/writer/direntPool.h b/src/writer/direntPool.h new file mode 100644 index 0000000..bc17da0 --- /dev/null +++ b/src/writer/direntPool.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2019 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DIRENTPOOL_H +#define ZIM_WRITER_DIRENTPOOL_H + +#include "debug.h" +#include "_dirent.h" + +namespace zim +{ + namespace writer { + class DirentPool { + private: + std::vector pools; + uint16_t direntIndex; + + void allocate_new_pool() { + pools.push_back(new Dirent[0xFFFF]); + direntIndex = 0; + } + + public: + DirentPool() : + direntIndex(0xFFFF) + {} + ~DirentPool() { + for(auto direntArray: pools) { + delete[] direntArray; + } + } + + Dirent* getDirent() { + if (direntIndex == 0xFFFF) { + allocate_new_pool(); + } + return pools.back() + direntIndex++; + } + }; + } +} + +#endif // ZIM_WRITER_DIRENTPOLL_H + diff --git a/src/writer/queue.h b/src/writer/queue.h new file mode 100644 index 0000000..c191bbf --- /dev/null +++ b/src/writer/queue.h @@ -0,0 +1,111 @@ +/* + * Copyright 2016 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the 
terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_QUEUE_H +#define OPENZIM_LIBZIM_QUEUE_H + +#define MAX_QUEUE_SIZE 10 + +#include +#include +#include "../tools.h" + +template +class Queue { + public: + Queue() {pthread_mutex_init(&m_queueMutex,NULL);}; + virtual ~Queue() {pthread_mutex_destroy(&m_queueMutex);}; + virtual bool isEmpty(); + virtual size_t size(); + virtual void pushToQueue(const T& element); + virtual bool getHead(T &element); + virtual bool popFromQueue(T &element); + + protected: + std::queue m_realQueue; + pthread_mutex_t m_queueMutex; + + private: + // Make this queue non copyable + Queue(const Queue&); + Queue& operator=(const Queue&); +}; + +template +bool Queue::isEmpty() { + pthread_mutex_lock(&m_queueMutex); + bool retVal = m_realQueue.empty(); + pthread_mutex_unlock(&m_queueMutex); + return retVal; +} + +template +size_t Queue::size() { + pthread_mutex_lock(&m_queueMutex); + size_t retVal = m_realQueue.size(); + pthread_mutex_unlock(&m_queueMutex); + return retVal; +} + +template +void Queue::pushToQueue(const T &element) { + unsigned int wait = 0; + unsigned int queueSize = 0; + + do { + zim::microsleep(wait); + pthread_mutex_lock(&m_queueMutex); + queueSize = m_realQueue.size(); + pthread_mutex_unlock(&m_queueMutex); + wait += 10; + } while (queueSize > MAX_QUEUE_SIZE); + + pthread_mutex_lock(&m_queueMutex); + m_realQueue.push(element); + 
pthread_mutex_unlock(&m_queueMutex); +} + +template +bool Queue::getHead(T &element) { + pthread_mutex_lock(&m_queueMutex); + if (m_realQueue.empty()) { + pthread_mutex_unlock(&m_queueMutex); + return false; + } + element = m_realQueue.front(); + pthread_mutex_unlock(&m_queueMutex); + return true; +} + +template +bool Queue::popFromQueue(T &element) { + pthread_mutex_lock(&m_queueMutex); + if (m_realQueue.empty()) { + pthread_mutex_unlock(&m_queueMutex); + return false; + } + + element = m_realQueue.front(); + m_realQueue.pop(); + pthread_mutex_unlock(&m_queueMutex); + + return true; +} + +#endif // OPENZIM_LIBZIM_QUEUE_H diff --git a/src/writer/workers.cpp b/src/writer/workers.cpp new file mode 100644 index 0000000..dd8821d --- /dev/null +++ b/src/writer/workers.cpp @@ -0,0 +1,192 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#include "creatordata.h" +#include "cluster.h" +#include "debug.h" +#include +#include "../endian_tools.h" +#include +#include + +#if defined(ENABLE_XAPIAN) + #include "xapianIndexer.h" +#endif + +#ifdef _WIN32 +#include +#else +#include +#endif + +#include +#include +#include +#include +#include "log.h" +#include "../fs.h" +#include "../tools.h" + +static pthread_mutex_t s_dbaccessLock = PTHREAD_MUTEX_INITIALIZER; +std::atomic zim::writer::ClusterTask::waiting_task(0); +std::atomic zim::writer::IndexTask::waiting_task(0); + +namespace zim +{ + namespace writer + { + + inline unsigned int countWords(const string& text) + { + unsigned int numWords = 1; + unsigned int length = text.size(); + + for (unsigned int i = 0; i < length;) { + while (i < length && text[i] != ' ') { + i++; + } + numWords++; + i++; + } + return numWords; + } + + const unsigned int keywordsBoostFactor = 3; + inline unsigned int getTitleBoostFactor(const unsigned int contentLength) + { + return contentLength / 500 + 1; + } + + + void ClusterTask::run(CreatorData* data) { + cluster->close(); + }; + + void IndexTask::run(CreatorData* data) { + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + try { + stemmer = Xapian::Stem(data->indexer->stemmer_language); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); + } catch (...) { + // No stemming for language. + } + indexer.set_stopper(&data->indexer->stopper); + indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); + + zim::MyHtmlParser htmlParser; + try { + htmlParser.parse_html(p_article->getData(), "UTF-8", true); + } catch (...) 
{} + if (htmlParser.dump.find("NOINDEX") != string::npos) + { + return; + } + + Xapian::Document document; + document.set_data(p_article->getUrl().getLongUrl()); + indexer.set_document(document); + + auto title = p_article->getTitle(); + auto normalizedTitle = zim::removeAccents(title); + auto keywords = zim::removeAccents(htmlParser.keywords); + auto content = zim::removeAccents(htmlParser.dump); + + document.add_value(0, title); + + std::stringstream countWordStringStream; + countWordStringStream << countWords(htmlParser.dump); + document.add_value(1, countWordStringStream.str()); + + if (htmlParser.has_geoPosition) { + auto geoPosition = Xapian::LatLongCoord( + htmlParser.latitude, htmlParser.longitude).serialise(); + document.add_value(2, geoPosition); + } + + /* Index the title */ + if (!normalizedTitle.empty()) { + indexer.index_text_without_positions( + normalizedTitle, getTitleBoostFactor(content.size())); + } + + /* Index the keywords */ + if (!keywords.empty()) { + indexer.index_text_without_positions(keywords, keywordsBoostFactor); + } + + /* Index the content */ + if (!content.empty()) { + indexer.index_text_without_positions(content); + } + + pthread_mutex_lock(&s_dbaccessLock); + data->indexer->writableDatabase.add_document(document); + pthread_mutex_unlock(&s_dbaccessLock); + } + + void* taskRunner(void* arg) { + auto creatorData = static_cast(arg); + Task* task; + unsigned int wait = 0; + + while(true) { + microsleep(wait); + wait += 100; + if (creatorData->taskList.popFromQueue(task)) { + if (task == nullptr) { + return nullptr; + } + task->run(creatorData); + delete task; + wait = 0; + } + } + return nullptr; + } + + void* clusterWriter(void* arg) { + auto creatorData = static_cast(arg); + Cluster* cluster; + unsigned int wait = 0; + while(true) { + microsleep(wait); + wait += 100; + if(creatorData->clusterToWrite.getHead(cluster)) { + if (cluster == nullptr) { + // All cluster writen, we can quit + return nullptr; + } + if (not 
cluster->isClosed()) { + continue; + } + creatorData->clusterToWrite.popFromQueue(cluster); + cluster->setOffset(offset_t(lseek(creatorData->out_fd, 0, SEEK_CUR))); + cluster->write(creatorData->out_fd); + cluster->clear_data(); + wait = 0; + } + } + return nullptr; + } + } +} diff --git a/src/writer/workers.h b/src/writer/workers.h new file mode 100644 index 0000000..9df11b0 --- /dev/null +++ b/src/writer/workers.h @@ -0,0 +1,83 @@ +/* + * Copyright 2016 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_WORKER_H +#define OPENZIM_LIBZIM_WORKER_H + +#include + +namespace zim { +namespace writer { + +class Cluster; +class CreatorData; + +class Task { + public: + Task() = default; + virtual ~Task() = default; + + virtual void run(CreatorData* data) = 0; +}; + +class ClusterTask : public Task { + public: + ClusterTask(Cluster* cluster) : + cluster(cluster) + { + ++waiting_task; + }; + virtual ~ClusterTask() + { + --waiting_task; + } + + virtual void run(CreatorData* data); + static std::atomic waiting_task; + + private: + Cluster* cluster; +}; + +class IndexTask : public Task { + public: + IndexTask(std::shared_ptr
article) : + p_article(article) + { + ++waiting_task; + } + virtual ~IndexTask() + { + --waiting_task; + } + + virtual void run(CreatorData* data); + static std::atomic waiting_task; + + private: + std::shared_ptr
p_article; +}; + +void* taskRunner(void* data); +void* clusterWriter(void* data); + +} +} + +#endif // OPENZIM_LIBZIM_QUEUE_H diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp new file mode 100644 index 0000000..c56b8ee --- /dev/null +++ b/src/writer/xapianIndexer.cpp @@ -0,0 +1,164 @@ +/* + * Copyright 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "xapianIndexer.h" +#include "libzim-resources.h" +#include "fs.h" +#include "tools.h" +#include +#include +#include + +/* Constructor */ +XapianIndexer::XapianIndexer(const std::string& language, IndexingMode indexingMode, const bool verbose) + : language(language), + indexingMode(indexingMode) +{ + /* Build ICU Local object to retrieve ISO-639 language code (from + ISO-639-3) */ + icu::Locale languageLocale(language.c_str()); + stemmer_language = languageLocale.getLanguage(); + + /* Read the stopwords */ + std::string stopWord; + try { + this->stopwords = getResource("stopwords/" + language); + } catch(ResourceNotFound& e) {} + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } +} + +XapianIndexer::~XapianIndexer() +{ + if (!indexPath.empty()) { + try { +#ifndef _WIN32 +//[TODO] Implement remove for windows + zim::DEFAULTFS::remove(indexPath + ".tmp"); + zim::DEFAULTFS::remove(indexPath); +#endif + } catch (...) 
{ + /* Do not raise */ + } + } +} + +void XapianIndexer::indexingPrelude(const string indexPath_) +{ + indexPath = indexPath_; + writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE); + switch (indexingMode) { + case IndexingMode::TITLE: + writableDatabase.set_metadata("valuesmap", "title:0"); + writableDatabase.set_metadata("kind", "title"); + break; + case IndexingMode::FULL: + writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2"); + writableDatabase.set_metadata("kind", "fulltext"); + break; + } + writableDatabase.set_metadata("language", language); + writableDatabase.set_metadata("stopwords", stopwords); + writableDatabase.begin_transaction(true); +} + +void XapianIndexer::index(const zim::writer::Article* article) +{ + switch (indexingMode) { + case IndexingMode::TITLE: + indexTitle(article); + break; + case IndexingMode::FULL: + indexFull(article); + break; + } +} + + +void XapianIndexer::indexFull(const zim::writer::Article* article) +{ +} + +void XapianIndexer::indexTitle(const zim::writer::Article* article) +{ + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + try { + stemmer = Xapian::Stem(stemmer_language); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); + } catch (...) 
{} + indexer.set_stopper(&stopper); + indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); + Xapian::Document currentDocument; + currentDocument.clear_values(); + currentDocument.set_data(article->getUrl().getLongUrl()); + indexer.set_document(currentDocument); + + std::string accentedTitle = article->getTitle(); + std::string title = zim::removeAccents(accentedTitle); + + currentDocument.add_value(0, accentedTitle); + + if (!title.empty()) { + indexer.index_text(title, 1); + } + + /* add to the database */ + writableDatabase.add_document(currentDocument); +} + +void XapianIndexer::flush() +{ + this->writableDatabase.commit_transaction(); + this->writableDatabase.begin_transaction(true); +} + +void XapianIndexer::indexingPostlude() +{ + this->flush(); + this->writableDatabase.commit_transaction(); + this->writableDatabase.commit(); + this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE); + this->writableDatabase.close(); +} + +XapianMetaArticle* XapianIndexer::getMetaArticle() +{ + return new XapianMetaArticle(this, indexingMode); +} + +zim::size_type XapianMetaArticle::getSize() const +{ + std::ifstream in(indexer->getIndexPath(), std::ios::binary|std::ios::ate); + return in.tellg(); +} + +std::string XapianMetaArticle::getFilename() const +{ + return indexer->getIndexPath(); +} + +zim::Blob XapianMetaArticle::getData() const +{ + throw std::logic_error("We should not pass here."); + return zim::Blob(); +} diff --git a/src/writer/xapianIndexer.h b/src/writer/xapianIndexer.h new file mode 100644 index 0000000..c934a9c --- /dev/null +++ b/src/writer/xapianIndexer.h @@ -0,0 +1,110 @@ +/* + * Copyright 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef LIBZIM_WRITER_XAPIANINDEXER_H +#define LIBZIM_WRITER_XAPIANINDEXER_H + +#include +#include + +#include +#include +#include +#include "xapian/myhtmlparse.h" + + +namespace zim { + namespace writer { + class IndexTask; + } +} +class XapianIndexer; + +enum class IndexingMode { + TITLE, + FULL +}; + +class XapianMetaArticle : public zim::writer::Article +{ + private: + XapianIndexer* indexer; + IndexingMode mode; + mutable std::string data; + + public: + XapianMetaArticle(XapianIndexer* indexer, IndexingMode mode) : indexer(indexer), mode(mode) + {} + virtual ~XapianMetaArticle() = default; + virtual zim::Blob getData() const; + virtual zim::writer::Url getUrl() const { + switch (mode) { + case IndexingMode::FULL: + return zim::writer::Url('X', "fulltext/xapian"); + case IndexingMode::TITLE: + return zim::writer::Url('X', "title/xapian"); + } + return zim::writer::Url(); + } + virtual std::string getTitle() const { + switch (mode) { + case IndexingMode::FULL: + return "Xapian Fulltext Index"; + case IndexingMode::TITLE: + return "Xapian Title Index"; + } + return ""; + } + virtual std::string getMimeType() const { return "application/octet-stream+xapian"; } + virtual bool isRedirect() const { return false; } + virtual bool shouldIndex() const { return false; } + virtual bool shouldCompress() const { return false; } + virtual zim::writer::Url getRedirectUrl() const { return zim::writer::Url(); } + virtual zim::size_type getSize() const; + virtual std::string getFilename() const; +}; + 
+class XapianIndexer +{ + public: + XapianIndexer(const std::string& language, IndexingMode mode, bool verbose); + virtual ~XapianIndexer(); + std::string getIndexPath() { return indexPath; } + void indexingPrelude(const string indexPath); + void index(const zim::writer::Article* article); + void flush(); + void indexingPostlude(); + XapianMetaArticle* getMetaArticle(); + + protected: + void indexTitle(const zim::writer::Article* article); + void indexFull(const zim::writer::Article* article); + + Xapian::WritableDatabase writableDatabase; + std::string stemmer_language; + Xapian::SimpleStopper stopper; + std::string indexPath; + std::string language; + std::string stopwords; + IndexingMode indexingMode; + + friend class zim::writer::IndexTask; +}; + +#endif // LIBZIM_WRITER_XAPIANINDEXER_H diff --git a/src/xapian/htmlparse.cc b/src/xapian/htmlparse.cc new file mode 100644 index 0000000..0f3316d --- /dev/null +++ b/src/xapian/htmlparse.cc @@ -0,0 +1,377 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002,2006,2007,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +// #include + +#include "htmlparse.h" + +#include + +// #include "utf8convert.h" + +#include + +#include +#include +#include +#include +#include + +using namespace std; + +inline void +lowercase_string(string &str) +{ + for (string::iterator i = str.begin(); i != str.end(); ++i) { + *i = tolower(static_cast(*i)); + } +} + +map zim::HtmlParser::named_ents; +static pthread_mutex_t sInitLock = PTHREAD_MUTEX_INITIALIZER; + +inline static bool +p_notdigit(char c) +{ + return !isdigit(static_cast(c)); +} + +inline static bool +p_notxdigit(char c) +{ + return !isxdigit(static_cast(c)); +} + +inline static bool +p_notalnum(char c) +{ + return !isalnum(static_cast(c)); +} + +inline static bool +p_notwhitespace(char c) +{ + return !isspace(static_cast(c)); +} + +inline static bool +p_nottag(char c) +{ + return !isalnum(static_cast(c)) && + c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. 
+} + +inline static bool +p_whitespacegt(char c) +{ + return isspace(static_cast(c)) || c == '>'; +} + +inline static bool +p_whitespaceeqgt(char c) +{ + return isspace(static_cast(c)) || c == '=' || c == '>'; +} + +bool +zim::HtmlParser::get_parameter(const string & param, string & value) +{ + map::const_iterator i = parameters.find(param); + if (i == parameters.end()) return false; + value = i->second; + return true; +} + +zim::HtmlParser::HtmlParser() +{ + static const struct ent { const char *n; unsigned int v; } ents[] = { +#include "namedentities.h" + { NULL, 0 } + }; + pthread_mutex_lock(&sInitLock); + if (named_ents.empty()) { + const struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } + } + pthread_mutex_unlock(&sInitLock); +} + +void +zim::HtmlParser::decode_entities(string &s) +{ + // We need a const_iterator version of s.end() - otherwise the + // find() and find_if() templates don't work... + string::const_iterator amp = s.begin(), s_end = s.end(); + while ((amp = find(amp, s_end, '&')) != s_end) { + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; + } + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + if (val < 0x80) { + s.replace(amp_pos, end - amp, 1u, char(val)); + } else { + // Convert unicode value val to UTF-8. 
+ char seq[4]; + unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); + s.replace(amp_pos, end - amp, seq, len); + } + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; + } else { + amp = end; + } + } +} + +void +zim::HtmlParser::parse_html(const string &body) +{ + in_script = false; + + parameters.clear(); + string::const_iterator start = body.begin(); + + while (true) { + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurrences of `<' which don't start + // a tag or comment. + string::const_iterator p = start; + while (true) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + unsigned char ch = *(p + 1); + + // Tag, closing tag, or comment (or SGML declaration). + if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; + + if (ch == '?') { + // PHP code or XML declaration. + // XML declaration is only valid at the start of the first line. + // FIXME: need to deal with BOMs... + if (p != body.begin() || body.size() < 20) break; + + // XML declaration looks something like this: + // + if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; + if (strchr(" \t\r\n", p[5]) == NULL) break; + + string::const_iterator decl_end = find(p + 6, body.end(), '?'); + if (decl_end == body.end()) break; + + // Default charset for XML is UTF-8. 
+ charset = "UTF-8"; + + string decl(p + 6, decl_end); + size_t enc = decl.find("encoding"); + if (enc == string::npos) break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 8); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '=') break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 1); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '"' && decl[enc] != '\'') break; + + char quote = decl[enc++]; + size_t enc_end = decl.find(quote, enc); + + if (enc != string::npos) + charset = decl.substr(enc, enc_end - enc); + + break; + } + p++; + } + + // Process text up to start of tag. + if (p > start) { + string text = body.substr(start - body.begin(), p - start); + // convert_to_utf8(text, charset); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) break; + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + ++start; + string::const_iterator close = find(start, body.end(), '>'); + // An unterminated comment swallows rest of document + // (like Netscape, but unlike MSIE IIRC) + if (close == body.end()) break; + + p = close; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + if (p != body.end()) { + // Check for htdig's "ignore this bit" comments. + if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { + string::size_type i; + i = body.find("", p + 1 - body.begin()); + if (i == string::npos) break; + start = body.begin() + i + 21; + continue; + } + // If we found --> skip to there. + start = p; + } else { + // Otherwise skip to the first > we found (as Netscape does). 
+ start = close; + } + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + lowercase_string(tag); + + if (closing) { + closing_tag(tag); + if (in_script && tag == "script") in_script = false; + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + // FIXME: parse parameters lazily. 
+ while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + name.assign(body, start - body.begin(), p - start); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + int quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + } + value.assign(body, start - body.begin(), p - start); + start = find_if(p, body.end(), p_notwhitespace); + + if (!name.empty()) { + // convert parameter name to lowercase + lowercase_string(name); + // in case of multiple entries, use the first + // (as Netscape does) + parameters.insert(make_pair(name, value)); + } + } + } +#if 0 + cout << "<" << tag; + map::const_iterator x; + for (x = parameters.begin(); x != parameters.end(); x++) { + cout << " " << x->first << "=\"" << x->second << "\""; + } + cout << ">\n"; +#endif + opening_tag(tag); + parameters.clear(); + + // In