From: Kunal Mehta Date: Thu, 12 Jan 2023 03:05:31 +0000 (+0000) Subject: Import zimlib_8.1.0+really8.0.0.orig.tar.gz X-Git-Tag: archive/raspbian/8.1.0+really8.0.0-1+rpi1^2~2 X-Git-Url: https://dgit.raspbian.org/?a=commitdiff_plain;h=82aa37e94f05a2d92230ef86e1fefd42640e5c5c;p=zimlib.git Import zimlib_8.1.0+really8.0.0.orig.tar.gz [dgit import orig zimlib_8.1.0+really8.0.0.orig.tar.gz] --- 82aa37e94f05a2d92230ef86e1fefd42640e5c5c diff --git a/.codecov.yml b/.codecov.yml new file mode 100644 index 0000000..21288b7 --- /dev/null +++ b/.codecov.yml @@ -0,0 +1,17 @@ +codecov: + notify: + require_ci_to_pass: yes + +coverage: + status: + project: + default: + threshold: 1% + patch: + default: + target: 90% + threshold: 0% + +ignore: + - "test" + - "examples" diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 0000000..f39dc2a --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,12 @@ +# These are supported funding model platforms + +github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] +patreon: # Replace with a single Patreon username +open_collective: # Replace with a single Open Collective username +ko_fi: # Replace with a single Ko-fi username +tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel +community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry +liberapay: # Replace with a single Liberapay username +issuehunt: # Replace with a single IssueHunt username +otechie: # Replace with a single Otechie username +custom: # https://kiwix.org/support-us/ diff --git a/.github/script/build_libzim.cmd b/.github/script/build_libzim.cmd new file mode 100644 index 0000000..aafa960 --- /dev/null +++ b/.github/script/build_libzim.cmd @@ -0,0 +1,10 @@ +call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat" + +set CC=cl.exe +set CXX=cl.exe + +meson.exe setup build . 
--force-fallback-for liblzma -Ddefault_library=static -Dwith_xapian=false -Dzstd:bin_programs=false -Dzstd:bin_tests=false -Dzstd:bin_contrib=false -Dliblzma:default_library=static -Dliblzma:enable_xz=false + +cd build + +ninja.exe diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..ee658f8 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,191 @@ +name: CI + +on: [push] + +jobs: + Macos: + strategy: + fail-fast: false + matrix: + target: + - native_dyn + - iOS_arm64 + - iOS_x86_64 + runs-on: macos-latest + steps: + - name: Checkout code + uses: actions/checkout@v1 + - name: Setup python 3.9 + uses: actions/setup-python@v1 + with: + python-version: '3.9' + - name: Install packages + run: | + brew update + brew install gcovr pkg-config ninja || brew link --overwrite python@3.9 + - name: Install python modules + run: pip3 install meson==0.52.1 pytest + - name: Install deps + shell: bash + run: | + ARCHIVE_NAME=deps2_osx_${{matrix.target}}_libzim.tar.xz + wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C $HOME + - name: Compile + shell: bash + run: | + MESON_OPTION="--default-library=shared" + MESON_CROSSFILE="$HOME/BUILD_${{matrix.target}}/meson_cross_file.txt" + if [[ ! "${{matrix.target}}" =~ native_.* ]]; then + MESON_OPTION="$MESON_OPTION -Db_bitcode=true --cross-file $MESON_CROSSFILE -Dstatic-linkage=true" + cat $MESON_CROSSFILE + fi + export PKG_CONFIG_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig + meson . 
build ${MESON_OPTION} + cd build + ninja + - name: Test + if: startsWith(matrix.target, 'native_') + shell: bash + run: | + export LD_LIBRARY_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib:$HOME/BUILD_${{matrix.target}}/INSTALL/lib64 + cd build + ninja download_test_data + meson test --verbose + env: + SKIP_BIG_MEMORY_TEST: 1 + + Windows: + runs-on: windows-2019 + steps: + - name: Checkout code + uses: actions/checkout@v1 + - name: Setup python 3.10 + uses: actions/setup-python@v2 + with: + python-version: '3.10' + - name: Install packages + run: + choco install ninja + - name: Install python modules + run: pip3 install meson + - name: Compile + shell: cmd + run: .github\script\build_libzim.cmd + - name: Test + shell: cmd + run: | + cd build + ninja download_test_data + meson test --verbose + + Linux: + strategy: + fail-fast: false + matrix: + target: + - native_static + - native_dyn + - alpine_dyn + - android_arm + - android_arm64 + - win32_static + - win32_dyn + with_xapian: + - true + - false + include: + - target: native_static + image_variant: bionic + lib_postfix: '/x86_64-linux-gnu' + - target: native_dyn + image_variant: bionic + lib_postfix: '/x86_64-linux-gnu' + - target: android_arm + image_variant: bionic + lib_postfix: '/arm-linux-androideabi' + - target: android_arm64 + image_variant: bionic + lib_postfix: '/aarch64-linux-android' + - target: alpine_dyn + image_variant: alpine + lib_postfix: '/x86_64-linux-musl' + - target: win32_static + image_variant: f35 + lib_postfix: '64' + - target: win32_dyn + image_variant: f35 + lib_postfix: '64' + env: + HOME: /home/runner + runs-on: ubuntu-latest + container: + image: "kiwix/kiwix-build_ci:${{matrix.image_variant}}-32" + steps: + - name: Checkout code + shell: python + run: | + from subprocess import check_call + from os import environ + config_command = [ + 'git', 'config', '--global', + 'http.postBuffer', '1048576000' + ] + check_call(config_command, cwd=environ['HOME']) + clone_command = [ + 'git', 
'clone', + 'https://github.com/${{github.repository}}', + '--depth=1', + '--branch', '${{github.ref_name}}' + ] + check_call(clone_command, cwd=environ['HOME']) + - name: Install deps + if: false == startsWith(matrix.target, 'alpine_') + shell: bash + run: | + ARCHIVE_NAME=deps2_${OS_NAME}_${{matrix.target}}_libzim.tar.xz + wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C /home/runner + - name: Compile + shell: bash + run: | + if [[ "${{matrix.target}}" =~ .*_dyn ]]; then + MESON_OPTION="--default-library=shared" + else + MESON_OPTION="--default-library=static" + fi + if [[ "${{matrix.target}}" =~ native_.* ]]; then + MESON_OPTION="$MESON_OPTION -Db_coverage=true" + elif [[ "${{matrix.target}}" != alpine_* ]]; then + MESON_OPTION="$MESON_OPTION --cross-file $HOME/BUILD_${{matrix.target}}/meson_cross_file.txt" + fi + if [[ "${{matrix.target}}" =~ android_.* ]]; then + MESON_OPTION="$MESON_OPTION -Dstatic-linkage=true -DUSE_BUFFER_HEADER=false" + fi + cd $HOME/libzim + meson . 
build ${MESON_OPTION} -Dwith_xapian=${{matrix.with_xapian}} + cd build + ninja + env: + PKG_CONFIG_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}/pkgconfig" + - name: Test + if: startsWith(matrix.target, 'native_') || startsWith(matrix.target, 'alpine_') + shell: bash + run: | + cd $HOME/libzim/build + ninja download_test_data + meson test --verbose + if [[ "${{matrix.target}}" =~ native_.* ]]; then + ninja coverage + fi + env: + LD_LIBRARY_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}" + SKIP_BIG_MEMORY_TEST: 1 + - name: Publish coverage + shell: bash + run: | + cd $HOME/libzim + curl https://codecov.io/bash -o codecov.sh + bash codecov.sh -n "${OS_NAME}_${{matrix.target}}" -Z + rm codecov.sh + if: startsWith(matrix.target, 'native_') + env: + CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }} diff --git a/.github/workflows/package.yml b/.github/workflows/package.yml new file mode 100644 index 0000000..208aa3f --- /dev/null +++ b/.github/workflows/package.yml @@ -0,0 +1,114 @@ +name: Packages +on: [push, pull_request] + +jobs: + build-deb: + runs-on: ubuntu-latest + strategy: + fail-fast: false + matrix: + distro: + - debian-unstable + - debian-bullseye + - debian-buster + - ubuntu-kinetic + - ubuntu-jammy + - ubuntu-focal + - ubuntu-bionic + steps: + - uses: actions/checkout@v2 + + # Determine which PPA we should upload to + - name: PPA + id: ppa + run: | + if [[ $REF == refs/tags* ]] + then + echo "::set-output name=ppa::kiwixteam/release" + else + echo "::set-output name=ppa::kiwixteam/dev" + fi + env: + REF: ${{ github.ref }} + + - uses: legoktm/gh-action-auto-dch@master + with: + fullname: Kiwix builder + email: release+launchpad@kiwix.org + distro: ${{ matrix.distro }} + + - uses: legoktm/gh-action-build-deb@debian-unstable + if: matrix.distro == 'debian-unstable' + name: Build package for 
debian-unstable + id: build-debian-unstable + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@debian-bullseye + if: matrix.distro == 'debian-bullseye' + name: Build package for debian-bullseye + id: build-debian-bullseye + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@debian-buster + if: matrix.distro == 'debian-buster' + name: Build package for debian-buster + id: build-debian-buster + with: + args: --no-sign + + - uses: legoktm/gh-action-build-deb@ubuntu-kinetic + if: matrix.distro == 'ubuntu-kinetic' + name: Build package for ubuntu-kinetic + id: build-ubuntu-kinetic + with: + args: --no-sign + ppa: ${{ steps.ppa.outputs.ppa }} + + - uses: legoktm/gh-action-build-deb@ubuntu-jammy + if: matrix.distro == 'ubuntu-jammy' + name: Build package for ubuntu-jammy + id: build-ubuntu-jammy + with: + args: --no-sign + ppa: ${{ steps.ppa.outputs.ppa }} + + - uses: legoktm/gh-action-build-deb@ubuntu-focal + if: matrix.distro == 'ubuntu-focal' + name: Build package for ubuntu-focal + id: build-ubuntu-focal + with: + args: --no-sign + ppa: ${{ steps.ppa.outputs.ppa }} + + - uses: legoktm/gh-action-build-deb@ubuntu-bionic + if: matrix.distro == 'ubuntu-bionic' + name: Build package for ubuntu-bionic + id: build-ubuntu-bionic + with: + args: --no-sign + ppa: ${{ steps.ppa.outputs.ppa }} + + - uses: actions/upload-artifact@v2 + with: + name: Packages for ${{ matrix.distro }} + path: output + + - uses: legoktm/gh-action-dput@master + name: Upload dev package + # Only upload on pushes to master + if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' && startswith(matrix.distro, 'ubuntu-') + with: + gpg_key: ${{ secrets.LAUNCHPAD_GPG }} + repository: ppa:kiwixteam/dev + packages: output/*_source.changes + + - uses: legoktm/gh-action-dput@master + name: Upload release package + # Only upload on pushes to master or tag + if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && startswith(matrix.distro, 
'ubuntu-') + with: + gpg_key: ${{ secrets.LAUNCHPAD_GPG }} + repository: ppa:kiwixteam/release + packages: output/*_source.changes diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d319a8f --- /dev/null +++ b/.gitignore @@ -0,0 +1,35 @@ +*~ +*#* +autom4te.cache +build +compile +config.h +configure +depcomp +.deps +.dirstamp +INSTALL +install-sh +*.kate-swp +*.la +.libs +libtool +*.lo +ltmain.sh +*.m4 +Makefile +Makefile.in +missing +*.o +stamp-h1 +.svn +.*.swp +*.zim +examples/createZimExample +src/tools/zimdump +src/tools/zimsearch +libzim.pc +test-driver +test/zimlib-test* +test/test-suite.log +.clangd diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 0000000..2b31fc1 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,17 @@ +# This is the list of Libzim's significant contributors. +# +# This does not necessarily list everyone who has contributed code, +# especially since many employees of one corporation may be contributing. +# To see the full list of contributors, see the revision history in +# source control. + +C. Scott Ananian https://github.com/cscott +Dmitry Atamanov https://github.com/data-man +Emmanuel Engelhart https://github.com/kelson42 +Kunal Mehta https://github.com/legoktm +Maneeshpm https://github.com/maneeshpm +Matthieu Gautier https://github.com/mgautierfr +MiguelRocha https://github.com/miguelrocha +Renaud Gaudin https://github.com/rgaudin +Tommi Mäkitalo https://github.com/maekitalo +Veloman Yunkan https://github.com/veloman-yunkan diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..e2683b5 --- /dev/null +++ b/COPYING @@ -0,0 +1,280 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. 
+ + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..08fa831 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,547 @@ +libzim 8.0.0 +============ + + * [API-BREAK] Remove lzma compression support in writer (@veloman-yunkan #718) + * Add new method `zim::Entry::getRedirectEntryIndex()` (@veloman-yunkan #716) + * Add new helper function `zim::setICUDataDirectory()` to help android wrapper + compilation (@mgautierfr #722) + * Fix `std::call_once` usage (alpine bug) (@veloman-yunkan #708) + * Better xapian indexation (no transaction, better compact algorithm) (@mgautierfr #719) + * Reserve more space (1968B instead of 944B) for mimetype list (@mgautierfr #720) + * [CI] Fix android compilation in the CI (@veloman-yunkan @mgautierfr #713) + * [CI] Add CI for Alpine (@veloman-yunkan #710) + * [CI] Support checkout of tag in the CI (@teeks99 #696) + * [CI] Remove movebot (@kelson42 #704) + * [CI] Remove Impish and add Kinetic packages (@legoktm #715) + * Fix code factor report (@kelson42 #700) + * Fix readme (@kelson42 #701 #716) + +libzim 7.2.2 +============ + + * Change the way we generate search result snippets. + We now ask xapian to generate "less" relevant snippets (even if in practice, + snippets are still good). But it now generates snippets far more quickly. + On cold search, no cache and low IO, search can go from 90s to 3s. 
+ (@mgautier #697) + * [CI] Update base images (@mgautier #695) + +libzim 7.2.1 +============ + + * Make suggestions diacritics insensitive (@veloman-yunkan #691) + * [Writer] Raise an exception when the user adds an invalid entry (duplicate path) + instead of printing a message (which can be too easily missed) and be buggy + (@mgautierfr #690) + * [Writer] Do not call `hasIndexData` and `getTitle` in the main thread when we add + an entry (@mgautier #684) + * [Writer] Properly clean and stop the writer even if the user hasn't called + `finishZimCreation` (The created zim file is still invalid) (@veloman-yunkan #666) + * Add a default argument value for mimetype of `creator::addMetadata` + (@kelvinhammond #678) + * Use a more informative message in exception when we cannot open a file + (@veloman-yunkan #667 #668) + * Use a generic dirent lookup to search by title (@veloman-yunkan #651) + * Various improvements: + - CI, Packaging : Stop creating packages for Ubuntu Hirsute (@legoktm #664) + - Update Readme (@TheDuchy #660) + - Fix cross-compilation host machine detection (@kelson42 #665) + - Fix macos/ios compilation (@mgautierfr #672) + - Update documentation @mgautierfr #677, @veloman-yunkan #682 + + +libzim 7.2.0 +============ + + * Add methods to get/print (dependencies) versions (@kelson42, #452) + * Fix Emscripten compilation (@kelson42, @mossroy, #643) + +libzim 7.1.0 +============ + + * Fix dirent test on 32 bits architectures (@mgautierfr #632) + * Fix compilation on Alpine - with musl (@amirouche #649) + * Don't crash if ZIM without illustration nor X/W namespace (@mgautierfr #641) + * Switch default suggestion operator to AND (@maneeshpm #644) + * Add a new method Archive::getMetadataItem (@mgautierfr #639) + * Better indexing criteria (@mgautierfr #642) + * Avoid duplicated archives in the searcher (@veloman-yunkan #648) + * Fix random entry (@veloman-yunkan #650) + * Various improvements. 
+ - CI @mgautierfr #640, @kelson42 #638, @legoktm #654 + - Doc @rgaudin #646 + +libzim 7.0.0 +============ + +Version 7.0.0 is a major release. + +The API has been completely rewritten. +Most notable change is that namespaces are now hidden. +The new API is described in documentation, which includes a Transition Guide from v6. + +ZIM files created with it use the new ZIM minor version (6.1 - see Header section of spec.) +Both backward and forward compatibility is kept. + +Improvements +------------ + + * Rewrite creator and reader API + This removes the namespace from the API. Articles are automatically put in + the right namespace ('A') and the retrieval of content is made using + specific API. (@mgautier #454) + * Better handling of the conditional compilation without xapian. + Before that, the search API was present (but returning empty result) if + libzim was compiled without xapian. Now the API is not present anymore. + User code must check if libzim is compiled with xapian or not by checking + if LIBZIM_WITH_XAPIAN is defined or not. (@mgautierfr #465) + * Add a new specific listing in zim files to list entries considered as "front + article". At creation, wrapper MUST pass the hint `FRONT_ARTICLE` to + correctly mark the entry. Search by title uses this list if present. + (@mgautierfr #487) + * Store the wellknown entries in the `W` namespace (`W/mainPage`) + (@mgautierfr #497) + * Rewrite Search API. Fix potential memory leak and allow correct reusing of + a created search. (@mgautierfr #530) + * New suggestion search API. The API mimics the Search API but is specialized + for suggestion (@maneeshpm #574) + * Add `zim::Archive` constructors to open an archive using an existing file descriptor. + This API is not available on Windows. 
(@veloman-yunkan #449) + * Make zstd the default compression algorithm (@veloman-yunkan #480) + * The method `zim::Archive::checkIntegrity` now checks if the mimetypes indicated in the + dirents are correct (@veloman-yunkan #505) + * Writer doesn't add a `.zim` extension to the given path. (@maneeshpm #503) + * Implement random entry picking. We are choosing an entry from the "front + article" list if present. (@mgautierfr #476) + * Creator now creates the `M/Counter` metadata. (@mgautierfr) + * Better Illustration handling. Favicon is replaced by Illustration. + Illustration can now have different size and scale (even if the API does + not use this feature) (@mgautierfr #540) + * Search iterator now has a method `getZimId` to know the Id of the zim + corresponding to the result (useful for multizim search) (@maneeshpm #557) + +Bug fixes +--------- + + * The method `zim::Archive::checkIntegrity` now checks if the dirents are + correctly sorted. (@veloman-yunkan #448) + * Handle large MIME-type list. Some zim files may have a pretty large mimetype + list. (@veloman-yunkan #460) + * Fix handling of zim file containing item of size 0. (@mgautierfr #483) + * Better parsing of the entry paths to detect the namespace (@maneeshpm #479) + * Fix zim file creation on Windows (@mgautierfr #508) + * Better algorithm tuning for suggestion search (@maneeshpm #492) + * The default indexer now indexes html content only. (@mgautierfr #511) + * Better suggestion search : Don't use stopwords, use OP_PHRASE + (@maneeshpm #501) + * Remove duplicate in the suggestion search (@maneeshpm #515) + * Remove the termlist from the xapian database, lower memory usage + (@maneeshpm #528) + * Add an anchor in the suggestion search to search term at the beginning of + the title (@maneeshpm #526) + * Make the suggestion search working with special characters (`&`, `+`) + (@veloman-yunkan #534) + * Fix creator issue not detecting that cluster must be extended if it + contains only 32-bit-sized content.
(@veloman-yunkan #552) + * Correctly generate suggestion snippet. (@maneeshpm #545) + * Better cluster size configuration (@mgautierfr #555) + * Make search iterator `getTitle` return the real title of the entry and not + the one stored in the xapian database (caseless) (@maneeshpm #586) + * Correcly close a zim creator to avoid a crash when the creator is + destructed without being started (@mgautierfr #613) + * Reduce the creator memory usage by reducing the memory size of the dirent + (@mgautier #616, #628) + * Write the cluster using a bigger chunk size for performance + (@mgautierfr #506) + * Change the default cluster size to 2MiB (@mgautierfr #585) + * The default mimetype for metadata now include the utf8 chardet + (@rgaudin #626) + * Improve the estimation of the number of search/suggestion results by forcing + Xapian to evaluate at least 10 results (@mgautier #625) + +Other +----- + + * Update xapian stopwords list. (@data-man #447) + * Remove direct pthread dependency (use c++11 thread library). (@mgautierfr #443) + We still need pthread library on linux and freebsd as C++11 is using it internally. + * [CI] Make the libzim CI compile libzim natively on Windows (@mgautierfr #453). + * [CI] Build libzim package for Ubuntu Hirsute and Impish + (@legoktm #459, #580) + * Always create zim file using the major version 6. (@mgautierfr #512) + * Move the test data files out of the git repository. Now test files are + stored in `zim-testing-suite` repository and must be downloaded. + (@mgautierfr #538, #535) + * Add search iterator unit test (@maneeshpm #547) + * Correctly fix search iterator method case to use camelCase everywhere + (@maneeshpm #563) + * Add a cast to string opertor on Uuid (@maneeshpm #582) + * Make unittest print the path of the missing zim file when something goes + wrong (@kelson42 #601) + * Delete temporary data (index) after we called `finishZimCreation` instead of + waiting for creator destruction. 
(@mgautierfr #603) + * Add basic user documentation (@mgautierfr #611) + +Known bugs +---------- + +Suggestion system using in current libkiwix doesn't work with new zim files +created with this release (and future ones). +New libkiwix version will be fixed and will work with new and old zim files. + + +libzim 6.3.2 +============ + +This is a hotfix of 6.3.0 : + * libzim now create zimfile with zstd compression 19 instead of 22. + So new libzim do not need to allocate 128Mb per cluster at decompression + time. + * At reading time, on 32 bits architectures, zstd cluster are not keep in + cache. This avoid use to also keep the decompression stream which reserve + 128Mb of memory address. + +libzim 6.3.1 +============ + +The release process of 6.3.1 was buggy. So, no 6.3.1. + + +libzim 6.3.0 +============ + + * Rewrite internal reader structure to use stream decompression. + This allow libzim to not decompresse the whole cluster to get an article + content. This is big performance improvement, it speedups random access by + 2, with a very small cost when doing "full" incremental reading + (zim-check/zim-dump). (@veloman-yunkan) + * Better dirent lookup. + Dirent lookup is the process of locating article data starting from the url + or title. This improves reading of zim file up to 10% (@veloman-yunkan) + * Add basic, first version of `validate` function to check internal structure + of a zim file. (@veloman-yunkan, @MiguelRocha) + * Fix compilation of libzim without xapian (@mgautierfr) + * Remove zlib dependency (and support of very old files created using zlib + compression) (@mgautierfr) + * New unit tests and various small fixes. + + +libzim 6.2.2 +============ + + * Check blob index before access it in the cluster. + * Refactoring of the cluster reading. + +libzim 6.2.1 (release process broken) +===================================== + + * Update readme and add link to repology.org packages list. + * Fix compilation on windows. 
+ +libzim 6.2.0 +============ + + * Fix compilation of libzim on freebsd. + * Rewrite unit tests to remove python based test and use gtest all the time. + * Make libzstd mandatory. + * Support for meson 0.45. + * Fix multipart support on macos. + * Add a documentation system. + * Better cache system implementation (huge speed up). + * Various (and numerous) small refactoring. + + +libzim 6.1.8 +============ + + * Increase default timeout for test to 120 seconds/test + * Compression algorithm to use can be passed to `zim::writer::Creator` + * Add automatic debian packaging of libzim. + * Fix using of tmpdir (and now use env var TMPDIR) during tests. + + +libzim 6.1.7 +============ + + * Do not assume urlPtrPos is just after the mimetype list. + * Fix compilation of compression test. + * Do not exit but throw an exception if an ASSERT is not fulfill. + +libzim 6.1.6 +============ + + * Better (faster) implementation of the ordering of article by cluster. + * Fix compression algorithm. + +libzim 6.1.5 +============ + + * [Writer] Remove unused declaration of classes. + Those classes were not implemented nor used at all. + +libzim 6.1.4 +============ + + * [Writer] Fix excessive memory usage. Data of the cluster were clean at the + end of the process, not once we don't need it. + +libzim 6.1.3 +============ + + * [Writer] Use a `.tmp` suffix and rename to `.zim` at the end of the write + proces. + * Add unit tests + * Do not include uncessary `windows.h` headers in public zim's headers. + +libzim 6.1.2 +============ + + * [CI] Fix codecov configuration + * [Writer] Fix threads synchronization at end of writing process. + +libzim 6.1.1 +============ + + * Fix bug around the find function + +libzim 6.1.0 +============ + + * Compile now on OpenBSD + * [Test] Use the main function provided by gtest. + * [CI] Move the CI compilation to github actions. + * Add stopwords for 54 new languages. + * [Writer] Improve the way we are writing cluster at zim creation time. 
+ - Clusters are directly written in the zim file instead of using temporary + files. + - mimetypes are limited to 944 bytes. + * Add a new type of iterator to iterate over articles in a performant way + reducing decompression of clusters. This is now the new default iterator. + * Add support for zim files compressed with zstd compression algorithm. + This is not possible to use zstd to create zim file for now. + +libzim 6.0.2 +============ + + * Fix search suggestion parsing. + +libzim 6.0.1 +============ + + * Fix crash when trying to open an empty file. + * Ensure that pytest tests are run on the CI. + +libzim 6.0.0 +============ + + * [Writer] Index the articles in differents threads. This is a huge speed + improvement as the main thread in not blocked by indexing. + * Index the title only if `shouldIndex` return true. + +libzim 5.1.0 +============ + + * Improve indexation of the title. + * Better pertinence of suggestions (only for new zim files) + * Improvement of the speed of Leveinstein distance for suggestions (for old + zims) + +libzim 5.0.2 +============ + + * Improve README. + * Remove gtest as embeded subproject. + * Better lzma compression. + * Better performance of the leveinstein algorithm (better suggestions + performance) + +libzim 5.0.1 +============ + + * Update README. + * [Writer] Add debug information (print progress of the clusters writing). + * [Writer] Correctly print the url to the user. + * [CI] Add code coverage. + +libzim 5.0.0 +============ + + * Fix thread slipping for win32 crosscompilation. + * Fix a potential invalid access when reading dirent. + * Fix memory leak in the decompression algorithm. + * [Writer] Fix a memory leak (cluster cleanning) + * [Writer] Write article data in a temporary cluster file instead of a + temporary file per article. + * [Writer] Better algorithm to store the dirent while creating the zim + file. Better memory usage. + * [Writer] [API Change] Url/Ns are now handle using the same struct Url. 
+ * [Writer] [API Change] No more aid and redirectAid. A redirectArticle + have to implement redirectUrl. + * [Writer] Use a memory pool to avoid multiple small memory allocations. + * [Writer] [API Change] Rename `ZimCreator` to `Creator`. + * [API Change] File's `search` and `suggestions` now return a unique_ptr + instead of a raw pointer. + +libzim 4.0.7 +============ + + * Build libzim without rpath. + +libzim 4.0.6 +============ + + * Support zim file created with cluster not written sequentially. + * Remove a meson warning. + +libzim 4.0.5 +============ + + * Store the xapian database in the right url. + * Do not fail when reading very small zim file (<256b). + * Do not print message on normal behavior. + * [BUILDSYSTEM] Be able to build a dynamic lib (libzim.so) but using static + dependencies. + * [CI] Use last version of meson. + * [CI] Use the new deps archive xz + +libzim 4.0.4 +============ + + * Fix opening of multi-part zim. + * Fix convertion of path to wpath on Windows. + +libzim 4.0.3 +============ + + * Implement low level file manipilation using different backends + +libzim 4.0.2 +============ + + * [Windows] Fix opening of zim file bigger than 4GiB + +libzim 4.0.1 +============ + + * [Writer] Fix wrong redirectyon log message + * Make libzim compile natively on windows using MSVC + * Better message when failing to read a zim file. + * Make libzim on windows correctly open unicode path. + * Add compilation option to use less memory (but more I/O). + Usefull on low memory devices (android) + * Small fixes + +libzim 4.0.0 +============ + + * [Writer] Remove a lot of memory copy. + * [Writer] Add xapian indexing directly in libzim. + * [Writer] Better API. + * [Writer] Use multi-threading to write clusters. + * [Writer] Ensure mimetype of articles article is not null. + * Extend test timeout for cluster's test. + * Less memory copy for cluster's test. 
+ * Allow skipping test using a lot memory using env variable + `SKIP_BIG_MEMORY_TEST=1` + * Explicitly use the icu namespace to allow using of packaged icu lib. + * Use a temporary file name as long as the ZIM writting process is + not finished (#163) + * [Travis] Do no compile using gcc-5 (but the default trusty's one 4.8) + +libzim 3.3.0 +============ + + * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly + done by : + * Do not mmap the whole cluster by default. + * MMap only the memory asociated to an article. + * If an article is > 4GiB, the blob associated to it is invalid + (data==size==0). + * Other information are still valid (directAccessInformation, ...) + * Fix writing of extended cluster in writer. + * Compile libzim on macos. + * Build libzim setting RPATH. + * Search result urls are now what is stored in the zim file. They should not + start with a `/`. This is a revert of the change made in last release. + (See kiwix/kiwix-lib#123) + * Spelling corrections in README. + +libzim 3.2.0 +============ + + * Support geo query if the xapian database has indexed localisation. + * Handle articles bigger than 4Go in the zim file (#110). + * Use AND operator between search term. + * Fix compilation with recent clang (#95). + * Add method to get article's data localisation in the zim file. + * Be able to get only a part of article (#77). + * Do not crash if we cannot open the xapian Database for some reasons. + (kiwix/kiwix-tools#153) + * Do not assumen there is always a checksum in the zim file. + (kiwix/kiwix-tools#150) + * Try to do some sanity checks when opening a zim file. + * Use pytest to do some tests (when cython is available). + * Use levenshtein distance to sort and have better suggestion results. + * Search result urls are now always absolute (starts with a '/'). + (kiwix/kiwix-lib#110) + * Open the file readonly when checking the zim file (and so be able to check + read only file). 
+ * Accept absolute url starting with '/' when searching for article. + * Fix various bugs + +libzim 3.1.0 +============ + + * Lzma is not a optional dependency anymore. + * Better handle (report and not crash) invalid zim file. + * Embed source of gtest (used only if gtest is not available on the system) + * Move zimDump tools out of libzim repository to zim-tools + * ZimCreator tools doesn't not read command line to set options. + +libzim 3.0.0 +============ + +This is a major change of the libzim. +Expect a lot new improvement and API changes. + + * Add a suggestion mode to the search + * Fix licensing issues + * Fix wrong stemming of the query when searching + * Deactivate searching (and so crash) in the embedded database if the zim is + splitted + * Rewrite the low level memory management of libzim when reading a zim file: + * We use a buffer base entity to handle memory and reading file instead of + reading file using stream. + * MMap the memory when posible to avoid memory copy. + * Use const when posible (API break) + * Move to googletest instead of cxxtools for unit-tests. + * Fix endiannes bug on arm. + * Do not install private headers. Those headers declare private structure and + should not be visible (API break) + * Compile libzim with `-Werror` and `-Wall` options. + * Make libzim thread safe for reading article. + The search part is not thread safe, and all search operation must be + protected by a lock. + * Add method to get only a part of a article. + * Move some tools to zim-tools repository. + + +libzim 2.0.0 +============ + + * Move to meson build system + `libzim` now use `meson` as build system instead of `autotools` + * Move to C++11 standard. + * Fulltext search in zim file. + We have integrated the xapian fulltext search in libzim. + So now, libzim provide an API to search in a zim containing embeded fulltext + index. This means that : + *libzim need xapian as (optional) dependencies (if you want compile with + xapian support). 
+ * The old and unused search API has been removed. + * Remove bzip2 support. + * Remove Symbian support. + * Few API hanges + * Make some header files private (not installed); + * A `Blob` can now be cast to a `string` directly; + * Change a lot of `File` methods to const methods. diff --git a/README.md b/README.md new file mode 100644 index 0000000..da103c9 --- /dev/null +++ b/README.md @@ -0,0 +1,213 @@ +Libzim +====== + +The Libzim is the reference implementation for the [ZIM file +format](https://wiki.openzim.org/wiki/ZIM_file_format). It's a [software +library](https://en.wikipedia.org/wiki/Library_(computing)) to read +and write ZIM files on many systems and architectures. More +information about the ZIM format and the openZIM project at +https://openzim.org/. + +[![Release](https://img.shields.io/github/v/tag/openzim/libzim?label=release&sort=semver)](https://download.openzim.org/release/libzim/) +[![Repositories](https://img.shields.io/repology/repositories/libzim?label=repositories)](https://github.com/openzim/libzim/wiki/Repology) +[![License](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) +[![Build](https://github.com/openzim/libzim/workflows/CI/badge.svg?query=branch%3Amaster)](https://github.com/openzim/libzim/actions?query=branch%3Amaster) +[![Doc](https://readthedocs.org/projects/libzim/badge/?style=flat)](https://libzim.readthedocs.io/en/latest/?badge=latest) +[![Codecov](https://codecov.io/gh/openzim/libzim/branch/master/graph/badge.svg)](https://codecov.io/gh/openzim/libzim) +[![CodeFactor](https://www.codefactor.io/repository/github/openzim/libzim/badge)](https://www.codefactor.io/repository/github/openzim/libzim) + +Disclaimer +---------- + +This document assumes you have a little knowledge about software +compilation. 
If you experience difficulties with the dependencies or +with the Libzim compilation itself, we recommend having a look at +[kiwix-build](https://github.com/kiwix/kiwix-build). + +Preamble +-------- + +Although the Libzim can be compiled/cross-compiled on/for many +systems, the following documentation explains how to do it on POSIX +ones. It is primarily intended for GNU/Linux systems and has been tested +on recent releases of Ubuntu and Fedora. + +Dependencies +------------ + +The Libzim relies on many third party software libraries. They are +prerequisites to the Libzim compilation. The following libraries +need to be available: +* [LZMA](https://tukaani.org/lzma/) (package `liblzma-dev` on Ubuntu) +* [ICU](http://site.icu-project.org/) (package `libicu-dev` on Ubuntu) +* [Zstd](https://facebook.github.io/zstd/) (package `libzstd-dev` on Ubuntu) +* [Xapian](https://xapian.org/) - optional (package `libxapian-dev` on Ubuntu) + +To test the code: +* [Google Test](https://github.com/google/googletest) (package `googletest` on Ubuntu) +* [ZIM Testing Suite](https://github.com/openzim/zim-testing-suite) - Reference test data set + +To build the documentation you need the packages: +* [Doxygen](https://www.doxygen.nl) +* Python packages for [Sphinx](https://www.sphinx-doc.org), [Sphinx rtd theme](https://github.com/readthedocs/sphinx_rtd_theme), [Breathe](https://breathe.readthedocs.io) and [Exhale](https://exhale.readthedocs.io) (packages `Sphinx`, `sphinx_rtd_theme`, `Breathe` and `Exhale` while using pip) + +These dependencies may or may not be packaged by your operating +system. They may also be packaged but only in an older version. The +compilation script will tell you if one of them is missing or too old. +In the worst case, you will have to download and compile a more recent +version by hand. + +If you want to install these dependencies locally, then ensure that +Meson (through `pkg-config`) will properly find them.
+ +Environment +------------- + +The Libzim builds using [Meson](https://mesonbuild.com/) version +0.43 or higher. Meson relies itself on Ninja, Pkg-config and few other +compilation tools. Install them first: +* Meson +* Ninja +* Pkg-config + +These tools should be packaged if you use a cutting edge operating +system. If not, have a look to the [Troubleshooting](#Troubleshooting) +section. + +Compilation +----------- + +Once all dependencies are installed, you can compile Libzim with: +```bash +meson . build +ninja -C build +``` + +By default, it will compile dynamic linked libraries. All binary files +will be created in the `build` directory created automatically by +Meson. If you want statically linked libraries, you can add +`--default-library=static` option to the Meson command. + +If you want to build the documentation, we need to pass the +`-Ddoc=true` option and run the `doc` target: +```bash +meson . build -Ddoc=true +ninja -C build doc +``` + +Depending on your system, `ninja` command may be called `ninja-build`. + +By default, Libzim tries to compile with Xapian (and will generate an +error if Xapian is not found). You can build without Xapian by +passing the option `-Dwith_xapian=false` : +```bash +meson . build -Dwith_xapian=false +ninja -C build doc +``` + +If Libzim is compiled without Xapian, all search API are removed. You +can test if an installed version of Libzim is compiled with or without +xapian by testing the define `LIBZIM_WITH_XAPIAN`. + +Testing +------- + +ZIM files needed by unit-tests are not included in this repository. By +default, Meson will use an internal directory in your build directory, +but you can specify another directory with option `test_data_dir`: +```bash +meson . build -Dtest_data_dir= +``` + +Whatever you specify a directory or not, you need a extra step to +download the data. 
Choose one of: +* Get the data from the repository + [openzim/zim-testing-suite](https://github.com/openzim/zim-testing-suite) + and put it yourself in the directory. +* Use the script + [download_test_data.py](scripts/download_test_data.py) which will + download and extract the data for you. +* Ask `ninja` to do it for you with `ninja download_test_data` once the + project is configured. + +The simple workflow is: +```bash +meson . build # Configure the project (using default directory for test data) +cd build +ninja # Build +ninja download_test_data # Download the test data +meson test # Test +``` + +It is possible to deactivate all tests using test data zim files by +passing `none` to the `test_data_dir` option: +```bash +meson . build -Dtest_data_dir=none +cd build +ninja +meson test # Run all tests except those needing test zim files. +``` + +If the automated tests fail or time out, you need to be aware that some +tests need up to 16GB of memory. You can skip those specific tests with: +```bash +SKIP_BIG_MEMORY_TEST=1 meson test +``` + +Installation +------------ + +If you want to install the Libzim and the headers you just have +compiled on your system, here we go: +```bash +ninja -C build install +``` + +You might need to run the command as root (or using `sudo`), depending +on where you want to install the libraries. After the installation +succeeded, you may need to run ldconfig (as root). + +Uninstallation +------------ + +If you want to uninstall the Libzim: +```bash +ninja -C build uninstall +``` + +Like for the installation, you might need to run the command as root +(or using `sudo`).
+ +Troubleshooting +--------------- + +If you need to install Meson "manually": +```bash +virtualenv -p python3 ./ # Create virtualenv +source bin/activate # Activate the virtualenv +pip3 install meson # Install Meson +hash -r # Refresh bash paths +``` + +If you need to install Ninja "manually": +```bash +git clone git://github.com/ninja-build/ninja.git +cd ninja +git checkout release +./configure.py --bootstrap +mkdir ../bin +cp ninja ../bin +cd .. +``` + +If the compilation still fails, you might need to get a more recent +version of a dependency than the one packaged by your Linux +distribution. Try then with a source tarball distributed by the +problematic upstream project or even directly from the source code +repository. + +License +------- + +[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or +later, see [COPYING](COPYING) for more details. diff --git a/debian/changelog b/debian/changelog new file mode 100644 index 0000000..9385d6f --- /dev/null +++ b/debian/changelog @@ -0,0 +1,5 @@ +libzim (0.0.0) unstable; urgency=medium + + * Initial release. 
+ + -- Kunal Mehta Tue, 02 Jun 2020 01:49:48 -0700 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..512b9aa --- /dev/null +++ b/debian/control @@ -0,0 +1,67 @@ +Source: libzim +Section: libs +Priority: optional +Build-Depends: debhelper-compat (= 13), + liblzma-dev, + libicu-dev, + libxapian-dev, + libzstd-dev, + uuid-dev, + libgtest-dev, + meson, + ninja-build, + pkg-config +Maintainer: Kiwix team +Homepage: https://www.openzim.org/wiki/Libzim +Standards-Version: 4.4.1 +Rules-Requires-Root: no + +Package: libzim7 +Architecture: any +Multi-Arch: same +Depends: ${misc:Depends}, + ${shlibs:Depends} +Pre-Depends: ${misc:Pre-Depends} +Conflicts: libzim0, libzim0v5, libzim2, libzim4, libzim5 +Replaces: libzim0, libzim0v5, libzim2, libzim4, libzim5 +Description: library implementation of ZIM specifications + ZIM (Zeno IMproved) is an open file format for storing the contents of + wiki for offline usage. This file format is primarily focused on + providing the contents of Wikipedia and Wikimedia projects for offline + use. + . + libzim is the standard implementation of ZIM specification, which + implements the read and write method for ZIM files. + . + ZIM is a file format created with focus on extracting and encoding data + from Mediawiki for offline use. + . + Features of libzim are: + * Native, coded in C++ + * Extremely fast + * Minimal footprint + * Minimal dependencies + * Portable on most OS (Windows, Linux, Mac OS X) + +Package: libzim-dev +Section: libdevel +Architecture: any +Depends: ${misc:Depends}, + libzim7 (= ${binary:Version}), + liblzma-dev, + libxapian-dev, + libicu-dev, + libzstd-dev +Description: library implementation of ZIM specifications (development) + ZIM (Zeno IMproved) is an open file format for storing the contents of + wiki for offline usage. This file format is primarily focused on + providing the contents of Wikipedia and Wikimedia projects for offline + use. + . 
+ libzim is the standard implementation of ZIM specification, which + implements the read and write method for ZIM files. + . + ZIM is a file format created with focus on extracting and encoding data + from Mediawiki for offline use. + . + This package contains development files. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..ff46366 --- /dev/null +++ b/debian/copyright @@ -0,0 +1 @@ +See COPYING in the repository root. diff --git a/debian/libzim-dev.install b/debian/libzim-dev.install new file mode 100644 index 0000000..1c1f0c5 --- /dev/null +++ b/debian/libzim-dev.install @@ -0,0 +1,3 @@ +usr/include/* +usr/lib/*/libzim.so +usr/lib/*/pkgconfig/* \ No newline at end of file diff --git a/debian/libzim7.install b/debian/libzim7.install new file mode 100644 index 0000000..146d0ad --- /dev/null +++ b/debian/libzim7.install @@ -0,0 +1 @@ +usr/lib/*/*.so.* \ No newline at end of file diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..67b325d --- /dev/null +++ b/debian/rules @@ -0,0 +1,15 @@ +#!/usr/bin/make -f +export DEB_BUILD_MAINT_OPTIONS = hardening=+all + +# Skip some extremely memory-intensive tests +export SKIP_BIG_MEMORY_TEST=1 +%: + dh $@ --buildsystem=meson + +# Skip tests that require zim-testing-data for now +override_dh_auto_configure: + dh_auto_configure -- -Dtest_data_dir=none + +# Increase test timeout +override_dh_auto_test: + dh_auto_test -- -t 3 diff --git a/debian/source/format b/debian/source/format new file mode 100644 index 0000000..89ae9db --- /dev/null +++ b/debian/source/format @@ -0,0 +1 @@ +3.0 (native) diff --git a/docs/.gitignore b/docs/.gitignore new file mode 100644 index 0000000..3d8a6cd --- /dev/null +++ b/docs/.gitignore @@ -0,0 +1,2 @@ +api +xml diff --git a/docs/6to7.rst b/docs/6to7.rst new file mode 100644 index 0000000..e897c97 --- /dev/null +++ b/docs/6to7.rst @@ -0,0 +1,369 @@ + +Libzim 7 transition guide +========================= + + +Libzim7 change a lot of things 
in the API and in the way we use namespaces (reflected in the API changes). +This part is a document helping to do the transition from libzim6 to libzim7. + +Namespace handling +------------------ + +In libzim6 namespaces were exposed to the user. It was to the user to handle them correctly. +Libzim6 was not doing any assumption about the namespaces. +However, the usage (mainly from libkiwix) was to store metadata in ``M`` namespace, articles in ``A`` and image/video in ``I``. + +On the opposite side, libzim7 hides the concept of namespace and handle it for the user. +While namespaces are still present and used in the zim format, they have vanished from the libzim api. +For information (but it is not important to use libzim), we now store all "user content" in ``C`` namespace. +Metadata are stored in ``M`` namespace and we use few other (``X``, ``W``) for some internal content. + +"User content" are accessed using "classic" method to get content. +Metadata, illustration and such are accessed using specific method. + +An article stored in ``A`` namespace before ("A/index.html") is now accessed simply using "index.html". +(It is stored in "C/index.html" in new format, but you must not specify the namespace in the new api). + +Compatibility +------------- + +libzim6 is agnostic about the namespaces. They are exposed to the user, whatever if we are +reading a new or old zim file. It is up to the user to correctly handle namespaces +(mainly, content are now in ``C`` instead of ``A``/``I``). + +libzim7 tries to be smart about the transition. It will look in the right namespace, depending +of the zim file. +Accessing "index.html" should work whatever if we use old or new namespace scheme. + +Accessing article/entry +----------------------- + +Getting one entry +................. + + +In libzim6 accessing an ``Article`` was done using a ``File`` instance. +You then had to check for the `Article` validity before using it. + + .. 
code-block:: c++ + + auto f = zim::File("foo.zim"); + auto a = f.getArticleByUrl("A/index.html"); + if (!a.good()) { + std::cerr << "No article \"A/index.html\"" << std::endl; + } + +In libzim7 you access an |Entry| using an |Archive| instance. +If the entry is not found, an exception is raised. + + .. code-block:: c++ + + auto a = zim::Archive("foo.zim"); + try { + auto e = a.getEntryByPath("index.html"); + } catch (zim::EntryNotFound& e) { + std::cerr << "No entry \"index.html\"" << std::endl; + } + + +Redirection +........... + + +Article in libzim6 may be a redirection to another article or an article containing data. +You had to check the kind of the article before using the right set of methods. +Using a method on a wrong kind was undefined behavior. + + .. code-block:: c++ + + auto article = [...]; + if (article.isRedirect()) { + auto target = article.getRedirectArticle(); + } else { + auto blob = article.getData(); + } + + +In libzim7, |Entry| is a kind of intermediate structure, either redirecting to another entry or an item. +An |Item| is the structure containing the data. + + .. code-block:: c++ + + auto entry = [...]; + if (entry.isRedirect()) { + auto target = entry.getRedirectEntry(); + } else { + auto item = entry.getItem(); + auto blob = item.getData(); + } + + +As a common usage is to get the item associated to the entry while resolving the redirection chain, +it is possible to do this easily : + +.. code-block:: c++ + + auto entry = [...]; + // Resolve any redirection chain and return the final item. + auto item = entry.getItem(true); + auto blob = item.getData(); + +Iteration +......... + +To iterate on article with libzim6 you had to use the ``begin*`` method to get an iterator. +You may iterate until ``end()`` was reached. + + .. code-block:: c++ + + auto file = [...]; + for(auto it = file.beginByUrl(); it!=file.end(); it++) { + auto article = *it; + [...]
+ } + + +If you wanted to iterate on articles starting with a url prefix it was a bit more complex : + + .. code-block:: c++ + + auto file = [...]; + auto it = file.find("A/ind"); + while(!it.is_end() && it->getUrl().startWith("A/ind")) { + auto article = *it; + [...] + it++; + } + + +In libzim7 you get |EntryRange| on which you can easily iterate: + + .. code-block:: c++ + + auto archive = [...]; + for(auto entry : archive.iterByPath()) { + [...] + } + + .. code-block:: c++ + + auto archive = [...]; + for(auto entry : archive.findByPath("ind")) { + [...] + } + +Searching +--------- + +In libzim6 searching was done using the single class ``Search`` + + .. code-block:: c++ + + auto f = zim::File("foo.zim"); + auto search = zim::Search(&f); + search.set_query("bar"); + search.set_range(10, 30); + for (auto it =search.begin(); it!=search.end(); it++) + { + std::cout << "Found result " << it.get_url() << std::endl; + } + +In libzim7 you search starting from a |Searcher|. + + .. code-block:: c++ + + // Create a searcher, something to search on an archive + zim::Searcher searcher(archive); + + // We need a query to specify what to search for + zim::Query query; + query.setQuery("bar"); + + // Create a search for the specified query + zim::Search search = searcher.search(query); + + // Now we can get some result from the search. + // 20 results starting from offset 10 (from 10 to 30) + zim::SearchResultSet results = search.getResults(10, 20); + + // SearchResultSet is iterable + for(auto entry: results) { + std::cout << entry.getPath() << std::endl; + } + +While it may seem a bit more complex (and it is), it has the main advantage of allowing +reuse of the different instances : + +- |Searcher| is what we are searching on, we can do several searches on it without recreating an internal xapian database. +- |Query| is what we are searching for. +- |Search| is a specific search.
+- |SearchResultSet| is a set of results for a |Search|; it allows getting particular results without having to search several times. + +Suggestion +---------- + +In libzim6, suggestions were made using the same class ``Search`` but by setting the suggestion mode before +iterating on the results. + + .. code-block:: c++ + + auto f = zim::File("foo.zim"); + auto search = zim::Search(&f); + search.set_query("bar"); + search.set_range(10, 30); + search.set_suggestion_mode(true); // <<< + for (auto it =search.begin(); it!=search.end(); it++) + { + std::cout << "Found result " << it.get_url() << std::endl; + } + +If the zim file had no suggestion database, the suggestion search was made on the full text database +(with variable results). + +In libzim7 you get suggestions using the |SuggestionSearcher| API : + + .. code-block:: c++ + + // Create a searcher, something to search on an archive + zim::SuggestionSearcher searcher(archive); + + // Create a search for the specified query + zim::SuggestionSearch search = searcher.search("bar"); + + // Now we can get some result from the search. + // 20 results starting from offset 10 (from 10 to 30) + zim::SuggestionResultSet results = search.getResults(10, 20); + + // SuggestionResultSet is iterable + for(auto entry: results) { + std::cout << entry.getPath() << std::endl; + } + + +Creating a zim file +------------------- + +Creating a zim file with libzim6 was pretty complex. +One had to inherit from ``zim::writer::Creator`` to provide the main url. +Then one had to inherit from ``zim::writer::Article`` to be able to add different kinds of articles to the zim file. + + .. 
code-block:: c++ + + class MyCreator: public zim::writer::Creator { + Url getMainUrl() const { return Url('A', "index.html"); } + }; + + class RedirectArticle : public zim::writer::Article { + public: + RedirectArticle(const std::string& title, const std::string& url, const std::string& target) + : title(title), + url(url), + target(target) + {} + + bool isRedirect() const { return true; } + zim::writer::Url getUrl() const { return url; } + std::string getTitle() const { return title; } + zim::writer::Url getRedirectUrl() const { return target; } + + private: + std::string title; + std::string url; + std::string target; + }; + + class ContentArticle: public zim::writer::Article { + ContentArticle(const std::string& title, const std::string& url, const std::string& mimetype, const std::string& content) + : title(title), + url(url), + mimetype(mimetype), + content(content) + {} + + bool isRedirect() const { return false; } + zim::writer::Url getUrl() const { return url; } + std::string getTitle() const { return title; } + std::string getMimeType() const { return mimetype; } + Blob getData() const { return Blob(content.data(), content.size()); } + private: + std::string title; + std::string url; + std::string mimetype; + std::string content; + }; + + int main() { + MyCreator creator(); + creator.startZimCreation("out_file.zim"); + std::shared_ptr article = std::make_shared("A article", "A/article", "text/html", "A content"); + creator.addArticle(article); + std::shared_ptr redirect = std::make_shared("A redirect", "A/redirect", "A/article"); + creator.addArticle(redirect); + creator.finishZimCreation(); + } + +On libzim7, you don't have to inherit the |Creator|. +Redirect and metadata are added using |addRedirection| and |addMetadata|. +You still may have to inherit |WriterItem| but default implementation +are provided (|StringItem|, |FileItem|). + + .. 
code-block:: c++ + + int main() { + zim::writer::Creator creator; + creator.startZimCreation(); + creator.addRedirection("A/redirect", "A redirect", "A/article"); + std::shared_ptr item = std::make_shared("article", "text/html", "A article", {}, "A content"); + creator.addItem(item); + creator.finishZimCreation(); + } + +Metadata and Illustration +......................... + +Metadata are adding using |addMetadata|. +You don't have to create a specific item in ``M`` namespace. + +The creator now create the ``M/Counter`` metadata for you. You don't have (and must not) add a ``M/Counter`` yourself. + +Favicon has been deprecated in favor of Illustration. +In libzim6, you had to add a file in ``I`` namespace and add a ``-/favicon`` redirection to the file. +In libzim7, you have to use the |addIllustration| method. + + +Hints +..... + +Hints are a new concept in libzim7. +This is a generic way to pass information to the creator about how to handle item/redirection. + +An almost mandatory hint to pass is the hint ``FRONT_ARTICLE`` (|HintKeys|). +``FRONT_ARTICLE`` mark entry (item or redirection) as main article for the reader +(typically a html page in opposition to a resource file as css, js, ...). +Random and suggestion feature will search only in entries marked as ``FRONT_ARTICLE``. +If no entry are marked as ``FRONT_ARTICLE``, all entries will be used. + + .. Declare some replacement helpers + + .. |Archive| replace:: :class:`zim::Archive` + .. |EntryRange| replace:: :class:`zim::Archive::EntryRange` + .. |Entry| replace:: :class:`zim::Entry` + .. |Item| replace:: :class:`zim::Item` + .. |EntryNotFound| replace:: :class:`zim::EntryNotFound` + .. |Searcher| replace:: :class:`zim::Searcher` + .. |Search| replace:: :class:`zim::Search` + .. |Query| replace:: :class:`zim::Query` + .. |SearchResultSet| replace:: :class:`zim::SearchResultSet` + .. |SuggestionSearcher| replace:: :class:`zim::SuggestionSearcher` + .. |getEntryByPath| replace:: :func:`getEntryByPath` + .. 
|getEntryByTitle| replace:: :func:`getEntryByTitle` + .. |findByPath| replace:: :func:`findByPath` + .. |findByTitle| replace:: :func:`findByTitle` + .. |Creator| replace:: :class:`zim::writer::Creator` + .. |WriterItem| replace:: :class:`zim::writer::Item` + .. |StringItem| replace:: :class:`zim::writer::StringItem` + .. |FileItem| replace:: :class:`zim::writer::FileItem` + .. |addMetadata| replace:: :func:`addMetadata` + .. |addRedirection| replace:: :func:`addRedirection` + .. |addIllustration| replace:: :func:`addIllustration` + .. |HintKeys| replace:: :enum:`zim::writer::HintKeys` diff --git a/docs/conf.py b/docs/conf.py new file mode 100644 index 0000000..ab420ab --- /dev/null +++ b/docs/conf.py @@ -0,0 +1,72 @@ +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +# import sys +# sys.path.insert(0, os.path.abspath('.')) + + +# -- Project information ----------------------------------------------------- + +project = 'libzim' +copyright = '2020, libzim-team' +author = 'libzim-team' + + +# -- General configuration --------------------------------------------------- + +on_rtd = os.environ.get('READTHEDOCS', None) == 'True' + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'breathe', + 'exhale' +] + +# Add any paths that contain templates here, relative to this directory. 
+templates_path = ['_templates'] + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + + +if not on_rtd: + html_theme = 'sphinx_rtd_theme' + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +breathe_projects = { + "libzim": "./xml" +} +breathe_default_project = 'libzim' + +exhale_args = { + "containmentFolder": "./api", + "rootFileName": "ref_api.rst", + "rootFileTitle": "Reference API", + "doxygenStripFromPath": "..", + "treeViewIsBootstrap": True, + "createTreeView" : True, + "exhaleExecutesDoxygen": True, + "exhaleDoxygenStdin": "INPUT = ../include" +} + +primary_domain = 'cpp' + +highlight_language = 'cpp' diff --git a/docs/index.rst b/docs/index.rst new file mode 100644 index 0000000..a2412dd --- /dev/null +++ b/docs/index.rst @@ -0,0 +1,15 @@ +.. libzim documentation master file, created by + sphinx-quickstart on Fri Jul 24 15:40:50 2020. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to libzim's documentation! +================================== + +.. 
toctree:: + :maxdepth: 2 + :caption: Contents: + + usage + 6to7 + api/ref_api diff --git a/docs/meson.build b/docs/meson.build new file mode 100644 index 0000000..71d4fc5 --- /dev/null +++ b/docs/meson.build @@ -0,0 +1,7 @@ + +sphinx = find_program('sphinx-build', native:true) + +sphinx_target = run_target('doc', + command: [sphinx, '-bhtml', + meson.current_source_dir(), + meson.current_build_dir()]) diff --git a/docs/requirements.txt b/docs/requirements.txt new file mode 100644 index 0000000..d805c25 --- /dev/null +++ b/docs/requirements.txt @@ -0,0 +1,3 @@ +breathe +exhale +sphinx<4 diff --git a/docs/usage.rst b/docs/usage.rst new file mode 100644 index 0000000..d4d4d1f --- /dev/null +++ b/docs/usage.rst @@ -0,0 +1,168 @@ +Libzim programming +================== + +Introduction +------------ + +libzim is written in C++. To use the library, you need the include files of libzim and have +to link against libzim. + +Errors are handled with exceptions. When something goes wrong, libzim throws an error, +which is always derived from std::exception. + +All classes are defined in the namespace zim. +Copying is allowed and is made as cheap as possible. +The reading part of libzim is thread safe most of the time. +The searching and creating parts are not. You have to serialize access to the class yourself. + +The main class, which accesses an archive, is |Archive|. +It actually holds a reference to an implementation, so that copies of the class just reference the same file. +You open a file by passing the file name to the constructor as a std::string. + +Iterating over entries is done by creating an |EntryRange|. + +.. 
code-block:: c++ + + #include + #include + #include + int main(int argc, char* argv[]) + { + try + { + zim::Archive a("wikipedia.zim"); + + for (auto entry: a.iterByPath()) { + std::cout << "path: " << entry.getPath() << " title: " << entry.getTitle() << std::endl; + } + } catch (const std::exception& e) { + std::cerr << e.what() << std::endl; + } + } + +In subsequent examples, only code needed to use the library will be explained. +The main-function with the error catcher should always be in place. + +Getting entries +--------------- + +Entries are addressed either by path or title. + +|Archive| has methods |getEntryByPath| and |getEntryByTitle|. Both take 1 parameters : a string, which specifies the path or the title of the entry to get. +They return a |Entry|. +If the entry cannot be found, they throw the exception |EntryNotFound|. + +Entry are entry point in a archive for "things". It can be a redirection to another entry or a |Item| + + .. code-block:: c++ + + auto entry = archive.getEntryByPath("foo"); + if (entry.isRedirect()) { + std::cout << "This is a redirection to " << entry.getRedirectEntry().getPath() << std::endl(); + } else { + std::cout << "This is a item with content : " << entry.getItem().getData() << std::endl(); + } + +As it is pretty common to resolve potential entry redirection and get the final item, you can do it directly using `getItem` : + + .. code-block:: c++ + + auto entry = archive.getEntryByPath("foo"); + auto item = entry.getItem(true); + if (entry.isRedirect()) { + std::cout << "Entry " << entry.getPath() << " is a entry pointing to the item " << item.getPath() << std::endl; + } else { + std::cout << entry.getPath() << " should be equal to " << item.getPath() << std::endl; + } + std::cout << "The item data is " << item.getData() << std::endl; + +Finding entries +--------------- + +|getEntryByPath|/|getEntryByTitle| allow to get a exact entry. +But you may want to find entries using a more loosely method. 
+|findByPath| and |findByTitle| allow you to find entries starting with the given path/title prefix. + +|findByPath|/|findByTitle| return an |EntryRange| you can iterate on : + + .. code-block:: c++ + + for (auto entry: archive.findByPath("fo")) { + std::cout << "Entry " << entry.getPath() << " should starts with fo." << std::endl; + } + +Searching for entries +--------------------- + +Finding entries by path/title is nice but you may want to search for entries based on their content. +If the zim archive contains a full text index, you can search on it. + +The class |Searcher| allows searching on one or several |Archive|. +It allows creating a |Search| which represents a particular search for a |Query|. +From a |Search|, you can get a |SearchResultSet| on which you can iterate. + + .. code-block:: c++ + + // Create a searcher, something to search on an archive + zim::Searcher searcher(archive); + + // We need a query to specify what to search for + zim::Query query; + query.setQuery("bar"); + + // Create a search for the specified query + zim::Search search = searcher.search(query); + + // Now we can get some result from the search. + // 20 results starting from offset 10 (from 10 to 30) + zim::SearchResultSet results = search.getResults(10, 20); + + // SearchResultSet is iterable + for(auto entry: results) { + std::cout << entry.getPath() << std::endl; + } + +Searching for suggestions +------------------------- + +While |findByTitle| may be a good start to search for suggestions, you may want to search for suggestions for a term +in the middle of the suggestion. + +The suggestion API allows you to search for suggestions, using the suggestion database included in recent zim files. +The suggestion API is pretty close to the search API: + + .. 
code-block:: c++ + + // Create a searcher, something to search on an archive + zim::SuggestionSearcher searcher(archive); + + // Create a search for the specified query + zim::SuggestionSearch search = searcher.search("bar"); + + // Now we can get some result from the search. + // 20 results starting from offset 10 (from 10 to 30) + zim::SuggestionResultSet results = search.getResults(10, 20); + + // SearchResultSet is iterable + for(auto entry: results) { + std::cout << entry.getPath() << std::endl; + } + +If the zim file doesn't contain a suggestion database, the suggestion will fallback to |findByTitle| for you. + + .. Declare some replacement helpers + + .. |Archive| replace:: :class:`zim::Archive` + .. |EntryRange| replace:: :class:`zim::Archive::EntryRange` + .. |Entry| replace:: :class:`zim::Entry` + .. |Item| replace:: :class:`zim::Item` + .. |EntryNotFound| replace:: :class:`zim::EntryNotFound` + .. |Searcher| replace:: :class:`zim::Searcher` + .. |Search| replace:: :class:`zim::Search` + .. |Query| replace:: :class:`zim::Query` + .. |SearchResultSet| replace:: :class:`zim::SearchResultSet` + .. |getEntryByPath| replace:: :func:`getEntryByPath` + .. |getEntryByTitle| replace:: :func:`getEntryByTitle` + .. |findByPath| replace:: :func:`findByPath` + .. |findByTitle| replace:: :func:`findByTitle` + diff --git a/examples/createZimExample.cpp b/examples/createZimExample.cpp new file mode 100644 index 0000000..998bff1 --- /dev/null +++ b/examples/createZimExample.cpp @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2012 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include + +#include +#include +#include + +class TestItem : public zim::writer::Item +{ + std::string _id; + std::string _data; + + public: + TestItem() { } + explicit TestItem(const std::string& id); + virtual ~TestItem() = default; + + virtual std::string getPath() const; + virtual std::string getTitle() const; + virtual std::string getMimeType() const; + + virtual std::unique_ptr getContentProvider() const; +}; + +TestItem::TestItem(const std::string& id) + : _id(id) +{ + std::ostringstream data; + data << "this is item " << id << std::endl; + _data = data.str(); +} + +std::string TestItem::getPath() const +{ + return std::string("A/") + _id; +} + +std::string TestItem::getTitle() const +{ + return _id; +} + +std::string TestItem::getMimeType() const +{ + return "text/plain"; +} + +std::unique_ptr TestItem::getContentProvider() const +{ + return std::unique_ptr(new zim::writer::StringProvider(_data)); +} + +int main(int argc, char* argv[]) +{ + unsigned max = 16; + try { + zim::writer::Creator c; + c.configVerbose(false).configCompression(zim::Compression::Zstd); + c.startZimCreation("foo.zim"); + for (unsigned n = 0; n < max; ++n) + { + std::ostringstream id; + id << (n + 1); + auto article = std::make_shared(id.str()); + c.addItem(article); + } + c.setMainPath("A/0"); + c.finishZimCreation(); + } + catch (const std::exception& e) + { + std::cerr << e.what() << std::endl; + } +} + diff --git a/examples/meson.build 
b/examples/meson.build new file mode 100644 index 0000000..3b804e4 --- /dev/null +++ b/examples/meson.build @@ -0,0 +1,6 @@ + +executable('createZimExample', 'createZimExample.cpp', + link_with: libzim, + link_args: extra_link_args, + include_directories: include_directory, + dependencies: [thread_dep, xapian_dep, icu_dep, lzma_dep, zstd_dep]) diff --git a/include/meson.build b/include/meson.build new file mode 100644 index 0000000..a08d9e6 --- /dev/null +++ b/include/meson.build @@ -0,0 +1,3 @@ +subdir('zim') + +include_directory = include_directories('.') diff --git a/include/zim/archive.h b/include/zim/archive.h new file mode 100644 index 0000000..2650d08 --- /dev/null +++ b/include/zim/archive.h @@ -0,0 +1,634 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ARCHIVE_H +#define ZIM_ARCHIVE_H + +#include "zim.h" +#include "entry.h" +#include "uuid.h" + +#include +#include +#include +#include +#include + +namespace zim +{ + class FileImpl; + + enum class EntryOrder { + pathOrder, + titleOrder, + efficientOrder + }; + + /** + * The Archive class to access content in a zim file. 
+ * + * The `Archive` is the main class to access content in a zim file. + * `Archive` are lightweight object and can be copied easily. + * + * An `Archive` is read-only, and internal states (as caches) are protected + * from race-condition. Therefore, all methods of `Archive` are threadsafe. + * + * All methods of archive may throw an `ZimFileFormatError` if the file is invalid. + */ + class Archive + { + public: + template class EntryRange; + template class iterator; + + /** Archive constructor. + * + * Construct an archive from a filename. + * The file is open readonly. + * + * The filename is the "logical" path. + * So if you want to open a split zim file (foo.zimaa, foo.zimab, ...) + * you must pass the `foo.zim` path. + * + * @param fname The filename to the file to open (utf8 encoded) + */ + explicit Archive(const std::string& fname); + +#ifndef _WIN32 + /** Archive constructor. + * + * Construct an archive from a file descriptor. + * + * Note: This function is not available under Windows. + * + * @param fd The descriptor of a seekable file representing a ZIM archive + */ + explicit Archive(int fd); + + /** Archive constructor. + * + * Construct an archive from a descriptor of a file with an embedded ZIM + * archive inside. + * + * Note: This function is not available under Windows. + * + * @param fd The descriptor of a seekable file with a continuous segment + * representing a complete ZIM archive. + * @param offset The offset of the ZIM archive relative to the beginning + * of the file (rather than the current position associated with fd). + * @param size The size of the ZIM archive. + */ + Archive(int fd, offset_type offset, size_type size); +#endif + + /** Return the filename of the zim file. + * + * Return the filename as passed to the constructor + * (So foo.zim). + * + * @return The logical filename of the archive. + */ + const std::string& getFilename() const; + + /** Return the logical archive size. 
+ * + * Return the size of the full archive, not the size of the file on the fs. + * If the zim is split, return the sum of the size of the parts. + * + * @return The logical size of the archive. + */ + size_type getFilesize() const; + + /** Return the number of entries in the archive. + * + * Return the total number of entries in the archive, including + * internal entries created by libzim itself, metadata, indexes, ... + * + * @return the number of all entries in the archive. + */ + entry_index_type getAllEntryCount() const; + + /** Return the number of user entries in the archive. + * + * If the notion of "user entries" doesn't exist in the zim archive, + * returns `getAllEntryCount()`. + * + * @return the number of user entries in the archive. + */ + entry_index_type getEntryCount() const; + + /** Return the number of articles in the archive. + * + * The definition of "article" depends on the zim archive. + * On recent archives, this corresponds to all entries marked as "FRONT_ARTICLE" + * at creation time. + * On old archives, this corresponds to all entries in the 'A' namespace. + * A few archives may have been created without namespace but also without specific + * article listing. In this case, articles are all user entries. + * + * @return the number of articles in the archive. + */ + entry_index_type getArticleCount() const; + + /** The uuid of the archive. + * + * @return the uuid of the archive. + */ + Uuid getUuid() const; + + /** Get a specific metadata content. + * + * Get the content of a metadata stored in the archive. + * + * @param name The name of the metadata. + * @return The content of the metadata. + * @exception EntryNotFound If the metadata is not in the archive. + */ + std::string getMetadata(const std::string& name) const; + + /** Get a specific metadata item. + * + * Get the item associated to a metadata stored in the archive. + * + * @param name The name of the metadata. + * @return The item associated to the metadata. 
+ * @exception EntryNotFound If the metadata is not in the archive. + */ + Item getMetadataItem(const std::string& name) const; + + /** Get the list of metadata stored in the archive. + * + * @return The list of metadata in the archive. + */ + std::vector getMetadataKeys() const; + + /** Get the illustration item of the archive. + * + * Illustration is an icon for the archive that can be used in catalogs and so on to illustrate the archive. + * + * @param size The size (width and height) of the illustration to get. Defaults to 48 (48x48px icon) + * @return The illustration item. + * @exception EntryNotFound If no illustration item can be found. + */ + Item getIllustrationItem(unsigned int size=48) const; + + /** Return a list of available sizes (width) for the illustrations in the archive. + * + * Illustration is an icon for the archive that can be used in catalogs and elsewhere to illustrate the archive. + * An Archive may contain several illustrations with different sizes. + * This method allows knowing which illustrations are in the archive (by size: width) + * + * @return A set of sizes. + */ + std::set getIllustrationSizes() const; + + + /** Get an entry using its "path" index. + * + * Use the index of the entry to get the idx'th entry + * (entries being sorted by path). + * + * @param idx The index of the entry. + * @return The Entry. + * @exception std::out_of_range If idx is greater than the number of entries. + */ + Entry getEntryByPath(entry_index_type idx) const; + + /** Get an entry using a path. + * + * Get an entry using its path. + * The path must contain the namespace. + * + * @param path The entry's path. + * @return The Entry. + * @exception EntryNotFound If no entry has the asked path. + */ + Entry getEntryByPath(const std::string& path) const; + + /** Get an entry using its "title" index. + * + * Use the index of the entry to get the idx'th entry + * (entries being sorted by title). + * + * @param idx The index of the entry. + * @return The Entry. 
+ * @exception std::out_of_range If idx is greater than the number of entry. + */ + Entry getEntryByTitle(entry_index_type idx) const; + + /** Get an entry using a title. + * + * Get an entry using its title. + * + * @param title The entry's title. + * @return The Entry. + * @exception EntryNotFound If no entry has the asked title. + */ + Entry getEntryByTitle(const std::string& title) const; + + /** Get an entry using its "cluster" index. + * + * Use the index of the entry to get the idx'th entry. + * The actual order of the entries is not really specified. + * It is inferred from the internal way the entries are stored. + * + * This method is probably not relevant and is provided for completeness. + * You should probably use an iterator using the `efficientOrder`. + * + * @param idx The index of the entry. + * @return The Entry. + * @exception std::out_of_range If idx is greater than the number of entries. + */ + Entry getEntryByClusterOrder(entry_index_type idx) const; + + /** Get the main entry of the archive. + * + * @return The Main entry. + * @exception EntryNotFound If no main entry has been specified in the archive. + */ + Entry getMainEntry() const; + + /** Get a random entry. + * + * The entry is picked randomly from the front article list. + * + * @return A random entry. + * @exception EntryNotFound If no valid random entry can be found. + */ + Entry getRandomEntry() const; + + /** Check if an entry with the given path is in the archive. + * + * @param path The entry's path. + * @return True if the path is in the archive, false otherwise. + */ + bool hasEntryByPath(const std::string& path) const { + try{ + getEntryByPath(path); + return true; + } catch(...) { return false; } + } + + /** Check if an entry with the given title is in the archive. + * + * @param title The entry's title. + * @return True if the title is in the archive, false otherwise. + */ + bool hasEntryByTitle(const std::string& title) const { + try{ + getEntryByTitle(title); + return true; + } catch(...) 
{ return false; } + } + + /** Check if archive has a main entry + * + * @return True if the archive has a main entry. + */ + bool hasMainEntry() const; + + /** Check if archive has an illustration entry + * + * @param size The size (width and height) of the illustration to check. Defaults to 48 (48x48px icon) + * @return True if the archive has a corresponding illustration entry. + * (Always True if the archive has no illustration, but a favicon) + */ + bool hasIllustration(unsigned int size=48) const; + + /** Check if the archive has a fulltext index. + * + * @return True if the archive has a fulltext index + */ + bool hasFulltextIndex() const; + + /** Check if the archive has a title index. + * + * @return True if the archive has a title index + */ + bool hasTitleIndex() const; + + + /** Get an "iterable" by path order. + * + * This method allows iterating on all user entries using a path order. + * If the notion of "user entries" doesn't exist (for old zim archives), + * this iterates on all entries in the zim file. + * + * ``` + * for(auto& entry:archive.iterByPath()) { + * ... + * } + * ``` + * + * @return A range on all the entries, in path order. + */ + EntryRange iterByPath() const; + + /** Get an "iterable" by title order. + * + * This method allows iterating on all articles using a title order. + * The definition of "article" depends on the zim archive. + * On recent archives, this corresponds to all entries marked as "FRONT_ARTICLE" + * at creation time. + * On old archives, this corresponds to all entries in the 'A' namespace. + * A few archives may have been created without namespace but also without specific + * article listing. In this case, this iterates on all user entries. + * + * ``` + * for(auto& entry:archive.iterByTitle()) { + * ... + * } + * ``` + * + * @return A range on all the entries, in title order. + */ + EntryRange iterByTitle() const; + + /** Get an "iterable" by an efficient order. 
+ * + * This method allows iterating on all user entries using an efficient order. + * If the notion of "user entries" doesn't exist (for old zim archives), + * this iterates on all entries in the zim file. + * + * ``` + * for(auto& entry:archive.iterEfficient()) { + * ... + * } + * ``` + * + * @return A range on all the entries, in efficient order. + */ + EntryRange iterEfficient() const; + + /** Find a range of entries starting with path. + * + * The path is the "long path". (Ie, with the namespace) + * + * @param path The path prefix to search for. + * @return A range starting from the first entry starting with path + * and ending past the last entry. + * If no entry starts with `path`, begin == end. + */ + EntryRange findByPath(std::string path) const; + + /** Find a range of entries starting with title. + * + * The entry title is searched in the `A` namespace. + * + * @param title The title prefix to search for. + * @return A range starting from the first entry starting with title + * and ending past the last entry. + * If no entry starts with `title`, begin == end. + */ + EntryRange findByTitle(std::string title) const; + + /** hasChecksum. + * + * The checksum is not the checksum of the file. + * It is an internal checksum stored in the zim file. + * + * @return True if the archive has a checksum. + */ + bool hasChecksum() const; + + /** getChecksum. + * + * @return the checksum stored in the archive. + * If the archive has no checksum return an empty string. + */ + std::string getChecksum() const; + + /** Check that the zim file is valid (in regard to its checksum). + * + * If the zim file has no checksum return false. + * + * @return True if the file is valid. + */ + bool check() const; + + /** Check the integrity of the zim file. + * + * Run different types of checks to verify the zim file is valid + * (in regard to the zim format). + * This may be time consuming. + * + * @return True if the file is valid. 
+ */ + bool checkIntegrity(IntegrityCheck checkType); + + /** Check if the file is split in the filesystem. + * + * @return True if the archive is split in different files (foo.zimaa, foo.zimab). + */ + bool isMultiPart() const; + + /** Get if the zim archive uses the new namespace scheme. + * + * Recent zim files use the new namespace scheme. + * + * From the user's perspective, it means that : + * - On old namespace scheme : + * . All entries are accessible, either using `getEntryByPath` with a specific namespace + * or simply iterating over the entries (with `iter*` methods). + * . Entry's path has namespace included ("A/foo.html") + * - On new namespace scheme : + * . Only the "user" entries are accessible with `getEntryByPath` and `iter*` methods. + * To access metadata, use the `getMetadata` method. + * . Entry's path does not contain namespace ("foo.html") + */ + bool hasNewNamespaceScheme() const; + + /** Get a shared ptr on the FileImpl + * + * @internal + * @return The shared_ptr + */ + std::shared_ptr getImpl() const { return m_impl; } + +#ifdef ZIM_PRIVATE + cluster_index_type getClusterCount() const; + offset_type getClusterOffset(cluster_index_type idx) const; + entry_index_type getMainEntryIndex() const; +#endif + + private: + std::shared_ptr m_impl; + }; + + template + entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); + + template<> + entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); + template<> + entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); + template<> + entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx); + + + /** + * A range of entries in an `Archive`. + * + * `EntryRange` represents a range of entries in a specific order. + * + * An `EntryRange` can't be modified and is consequently threadsafe. 
+ */ + template + class Archive::EntryRange { + public: + explicit EntryRange(const std::shared_ptr file, entry_index_type begin, entry_index_type end) + : m_file(file), + m_begin(begin), + m_end(end) + {} + + iterator begin() const + { return iterator(m_file, entry_index_type(m_begin)); } + iterator end() const + { return iterator(m_file, entry_index_type(m_end)); } + int size() const + { return m_end - m_begin; } + + EntryRange offset(int start, int maxResults) const + { + auto begin = m_begin + start; + if (begin > m_end) { + begin = m_end; + } + auto end = m_end; + if (begin + maxResults < end) { + end = begin + maxResults; + } + return EntryRange(m_file, begin, end); + } + +private: + std::shared_ptr m_file; + entry_index_type m_begin; + entry_index_type m_end; + }; + + /** + * An iterator on an `Archive`. + * + * `Archive::iterator` stores an internal state which is not protected + * from race-condition. It is not threadsafe. + * + * An `EntryRange` can't be modified and is consequently threadsafe. 
+ */ + template + class Archive::iterator : public std::iterator + { + public: + explicit iterator(const std::shared_ptr file, entry_index_type idx) + : m_file(file), + m_idx(idx), + m_entry(nullptr) + {} + + iterator(const iterator& other) + : m_file(other.m_file), + m_idx(other.m_idx), + m_entry(other.m_entry?new Entry(*other.m_entry):nullptr) + {} + + bool operator== (const iterator& it) const + { return m_file == it.m_file && m_idx == it.m_idx; } + bool operator!= (const iterator& it) const + { return !operator==(it); } + + iterator& operator=(iterator&& it) = default; + + iterator& operator=(const iterator& it) /* const source, so const iterators can be assigned from; the cached entry is dropped and reloaded lazily by operator* */ + { + m_entry.reset(); + m_idx = it.m_idx; + m_file = it.m_file; + return *this; + } + + iterator& operator++() + { + ++m_idx; + m_entry.reset(); + return *this; + } + + iterator operator++(int) + { + auto it = *this; + operator++(); + return it; + } + + iterator& operator--() + { + --m_idx; + m_entry.reset(); + return *this; + } + + iterator operator--(int) + { + auto it = *this; + operator--(); + return it; + } + + const Entry& operator*() const + { + if (!m_entry) { + m_entry.reset(new Entry(m_file, _toPathOrder(*m_file, m_idx))); + } + return *m_entry; + } + + const Entry* operator->() const + { + operator*(); + return m_entry.get(); + } + + private: + std::shared_ptr m_file; + entry_index_type m_idx; + mutable std::unique_ptr m_entry; + }; + + /** + * The set of the integrity checks to be performed by `zim::validate()`. + */ + typedef std::bitset IntegrityCheckList; + + /** Check the integrity of the zim file. + * + * Run the specified checks to verify the zim file is valid + * (with regard to the zim format). Some checks can be quite slow. + * + * @param zimPath The path of the ZIM archive to be checked. + * @param checksToRun The set of checks to perform. + * @return False if any check fails, true otherwise. 
+ */ + bool validate(const std::string& zimPath, IntegrityCheckList checksToRun); +} + +#endif // ZIM_ARCHIVE_H + diff --git a/include/zim/blob.h b/include/zim/blob.h new file mode 100644 index 0000000..2b0662d --- /dev/null +++ b/include/zim/blob.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BLOB_H +#define ZIM_BLOB_H + +#include "zim.h" + +#include +#include +#include +#include + +namespace zim +{ + /** + * A blob is a pointer to data, potentially stored in an `Archive`. + * + * All `Blob`'s methods are threadsafe. + */ + class Blob + { + public: // types + using DataPtr = std::shared_ptr; + + public: // functions + /** + * Construct an empty `Blob` + */ + Blob(); + + /** + * Construct a `Blob` pointing to `data`. + * + * The created blob only points to the data and doesn't own it. + * The user must take care that the data is not freed before the blob is used. + */ + Blob(const char* data, size_type size); + + /** + * Construct a `Blob` pointing to `data`. + * + * The created blob shares ownership of the data. 
+ */ + Blob(const DataPtr& buffer, size_type size); + + operator std::string() const { return std::string(_data.get(), _size); } + const char* data() const { return _data.get(); } + const char* end() const { return _data.get() + _size; } + size_type size() const { return _size; } + + private: + DataPtr _data; + size_type _size; + }; + + inline std::ostream& operator<< (std::ostream& out, const Blob& blob) + { + if (blob.data()) + out.write(blob.data(), blob.size()); + return out; + } + + inline bool operator== (const Blob& b1, const Blob& b2) + { + return b1.size() == b2.size() + && std::equal(b1.data(), b1.data() + b1.size(), b2.data()); + } +} + +#endif // ZIM_BLOB_H diff --git a/include/zim/entry.h b/include/zim/entry.h new file mode 100644 index 0000000..6944aa6 --- /dev/null +++ b/include/zim/entry.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ENTRY_H +#define ZIM_ENTRY_H + +#include "zim.h" + +#include +#include + +namespace zim +{ + class Item; + class Dirent; + class FileImpl; + + /** + * An entry in an `Archive`. + * + * All `Entry`'s methods are threadsafe. 
+ */ + class Entry + { + public: + explicit Entry(std::shared_ptr file_, entry_index_type idx_); + + bool isRedirect() const; + std::string getTitle() const; + std::string getPath() const; + + /** Get the item associated with the entry. + * + * An item is associated only if the entry is not a redirect. + * For convenience, if follow is true, return the item associated with the targeted entry. + * + * @param follow True if the redirection is resolved before getting the item. (false by default) + * @return The Item associated with the entry. + * @exception InvalidType if the entry is a redirection and follow is false. + */ + Item getItem(bool follow=false) const; + + /** Get the item associated with the target entry. + * + * If there is a chain of redirections, the whole chain is resolved + * and the item associated with the last entry is returned. + * + * @return the Item associated with the targeted entry. + * @exception InvalidType if the entry is not a redirection. + */ + Item getRedirect() const; + + /** Get the Entry targeted by the entry. + * + * @return The entry directly targeted by this redirect entry. + * @exception InvalidEntry if the entry is not a redirection. + */ + Entry getRedirectEntry() const; + + /** Get the index of the Entry targeted by the entry. + * + * @return The index of the entry directly targeted by this redirect + * entry. + * @exception InvalidEntry if the entry is not a redirection. 
+ */ + entry_index_type getRedirectEntryIndex() const; + + entry_index_type getIndex() const { return m_idx; } + + private: + std::shared_ptr m_file; + entry_index_type m_idx; + std::shared_ptr m_dirent; + }; + +} + +#endif // ZIM_ENTRY_H + diff --git a/include/zim/error.h b/include/zim/error.h new file mode 100644 index 0000000..48acd09 --- /dev/null +++ b/include/zim/error.h @@ -0,0 +1,53 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ERROR_H +#define ZIM_ERROR_H + +#include + +namespace zim +{ + class ZimFileFormatError : public std::runtime_error + { + public: + explicit ZimFileFormatError(const std::string& msg) + : std::runtime_error(msg) + { } + }; + + class InvalidType: public std::logic_error + { + public: + explicit InvalidType(const std::string& msg) + : std::logic_error(msg) + {} + }; + + class EntryNotFound : public std::runtime_error + { + public: + explicit EntryNotFound(const std::string& msg) + : std::runtime_error(msg) + {} + }; +} + +#endif // ZIM_ERROR_H + diff --git a/include/zim/item.h b/include/zim/item.h new file mode 100644 index 0000000..765b78a --- /dev/null +++ b/include/zim/item.h @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2021 Veloman Yunkan + * Copyright (C) 
2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ITEM_H +#define ZIM_ITEM_H + +#include "zim.h" +#include "blob.h" +#include + +namespace zim +{ + class Dirent; + class FileImpl; + + /** + * An `Item` in an `Archive` + * + * All `Item`'s methods are threadsafe. + */ + class Item + { + public: // types + typedef std::pair DirectAccessInfo; + + public: // functions + explicit Item(std::shared_ptr file_, entry_index_type idx_); + + std::string getTitle() const; + std::string getPath() const; + std::string getMimetype() const; + + /** Get the data associated with the item + * + * Get the data of the item, starting at offset. + * + * @param offset The number of bytes to skip at the beginning of the data. + * @return A blob corresponding to the data. + */ + Blob getData(offset_type offset=0) const; + + /** Get the data associated with the item + * + * Get the `size` bytes of data of the item, starting at offset. + * + * @param offset The number of bytes to skip at the beginning of the data. + * @param size The number of bytes to read. + * @return A blob corresponding to the data. + */ + Blob getData(offset_type offset, size_type size) const; + + /** The size of the item. + * + * @return The size (in byte) of the item. 
+ */ + size_type getSize() const; + + /** Direct access information. + * + * Some items are stored raw in the zim file. + * If possible, this function gives information about which file + * and at which offset to read to get the data. + * + * It can be useful as an optimisation when interacting with other systems + * by reopening the file and reading the content, bypassing libzim. + * + * @return A pair of filename/offset specifying where to read the content. + * If it is not possible to have direct access for this item, + * return a pair of `{"", 0}` + */ + DirectAccessInfo getDirectAccessInformation() const; + + entry_index_type getIndex() const { return m_idx; } + +#ifdef ZIM_PRIVATE + cluster_index_type getClusterIndex() const; +#endif + + private: // data + std::shared_ptr m_file; + entry_index_type m_idx; + std::shared_ptr m_dirent; + }; + +} + +#endif // ZIM_ITEM_H + diff --git a/include/zim/meson.build b/include/zim/meson.build new file mode 100644 index 0000000..b8b6c49 --- /dev/null +++ b/include/zim/meson.build @@ -0,0 +1,34 @@ +zim_config = configure_file(output : 'zim_config.h', + configuration : public_conf) + +install_headers( + 'archive.h', + 'blob.h', + 'error.h', + 'item.h', + 'entry.h', + 'uuid.h', + 'zim.h', + 'suggestion.h', + 'suggestion_iterator.h', + 'tools.h', + 'version.h', + zim_config, + subdir:'zim' +) + +if xapian_dep.found() + install_headers( + 'search.h', + 'search_iterator.h', + subdir:'zim' + ) +endif + +install_headers( + 'writer/item.h', + 'writer/creator.h', + 'writer/contentProvider.h', + subdir:'zim/writer' +) + diff --git a/include/zim/search.h b/include/zim/search.h new file mode 100644 index 0000000..2b4cb66 --- /dev/null +++ b/include/zim/search.h @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * 
published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_H +#define ZIM_SEARCH_H + +#include "search_iterator.h" +#include "archive.h" +#include +#include +#include + +namespace Xapian { + class Enquire; + class MSet; +}; + +namespace zim +{ + +class Archive; +class InternalDataBase; +class Query; +class Search; +class SearchResultSet; + +/** + * A Searcher is an object performing fulltext search over a set of Archives + * + * A Searcher is mainly used to create new `Search` + * Internally, this is mainly a wrapper around a Xapian database. + * + * You should consider that all search operations are NOT threadsafe. + * It is up to you to protect your calls to avoid race conditions. + * However, Searcher (and subsequent classes) do not maintain a global/shared state. + * You can create several Searchers and use them in different threads. + */ +class Searcher +{ + public: + /** Searcher constructor. + * + * Construct a searcher on top of several archives (multi search). + * + * @param archives A list(vector) of archives to search on. + */ + explicit Searcher(const std::vector& archives); + + /** Searcher constructor. + * + * Construct a searcher on top of one archive. + * + * @param archive An archive to search on. 
+ */ + explicit Searcher(const Archive& archive); + Searcher(const Searcher& other); + Searcher& operator=(const Searcher& other); + Searcher(Searcher&& other); + Searcher& operator=(Searcher&& other); + ~Searcher(); + + /** Add an archive to the searcher. + * + * Adding an archive to a searcher does not invalidate already created searches. + */ + Searcher& addArchive(const Archive& archive); + + /** Create a search for a specific query. + * + * The search is made on all archives added to the Searcher. + * + * @param query The Query to search. + * + * @throws std::runtime_error if the searcher does not have a valid + * FT database. + */ + Search search(const Query& query); + + /** Set the verbosity of search operations. + * + * @param verbose The verbose mode to set + */ + void setVerbose(bool verbose); + + private: // methods + void initDatabase(); + + private: // data + std::shared_ptr mp_internalDb; + std::vector m_archives; + bool m_verbose; +}; + +/** + * A Query represents a query. + * + * It describes what has to be searched and how. + * A Query is "database" independent. + */ +class Query +{ + public: + /** Query constructor. + * + * Create a query (empty by default). + */ + Query(const std::string& query = ""); + + /** Set the textual query of the Query. + * + * @param query The string to search for. + */ + Query& setQuery(const std::string& query); + + /** Set the geographical query of the Query. + * + * Some articles may be geo-positioned. + * You can search for articles within a certain distance of a point. + * + * @param latitude The latitude of the point. + * @param longitude The longitude of the point. + * @param distance The maximal distance from the point. + */ + Query& setGeorange(float latitude, float longitude, float distance); + + std::string m_query { "" }; + + bool m_geoquery { false }; + float m_latitude { 0 }; + float m_longitude { 0 }; + float m_distance { 0 } ; +}; + + +/** + * A Search represent a particular search, based on a `Searcher`. 
+ * + * This is somehow the reunification of a `Searcher` (what to search on) + * and a `Query` (what to search for). + */ +class Search +{ + public: + Search(Search&& s); + Search& operator=(Search&& s); + ~Search(); + + /** Get a set of results for this search. + * + * @param start The beginning of the range to get + * (offset of the first result). + * @param maxResults The maximum number of results to return + * (offset of last result from the start of range). + */ + const SearchResultSet getResults(int start, int maxResults) const; + + /** Get the number of estimated results for this search. + * + * As the name suggests, it is an estimation of the number of results. + */ + int getEstimatedMatches() const; + + private: // methods + Search(std::shared_ptr p_internalDb, const Query& query); + Xapian::Enquire& getEnquire() const; + + private: // data + std::shared_ptr mp_internalDb; + mutable std::unique_ptr mp_enquire; + Query m_query; + + friend class Searcher; +}; + +/** + * The `SearchResultSet` represents a range of results corresponding to a `Search`. + * + * It mainly allows getting an iterator. + */ +class SearchResultSet +{ + public: + typedef SearchIterator iterator; + + /** The begin iterator on the result range. */ + iterator begin() const; + + /** The end iterator on the result range. 
*/ + iterator end() const; + + /** The size of the SearchResult (end()-begin()) */ + int size() const; + + private: + SearchResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset); + SearchResultSet(std::shared_ptr p_internalDb); + + private: // data + std::shared_ptr mp_internalDb; + std::shared_ptr mp_mset; + friend class Search; +}; + +} //namespace zim + +#endif // ZIM_SEARCH_H diff --git a/include/zim/search_iterator.h b/include/zim/search_iterator.h new file mode 100644 index 0000000..a8c98f0 --- /dev/null +++ b/include/zim/search_iterator.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_ITERATOR_H +#define ZIM_SEARCH_ITERATOR_H + +#include +#include +#include "entry.h" +#include "archive.h" +#include "uuid.h" + +namespace zim +{ +class SearchResultSet; + +class SearchIterator : public std::iterator +{ + friend class zim::SearchResultSet; + public: + SearchIterator(); + SearchIterator(const SearchIterator& it); + SearchIterator& operator=(const SearchIterator& it); + SearchIterator(SearchIterator&& it); + SearchIterator& operator=(SearchIterator&& it); + ~SearchIterator(); + + bool operator== (const SearchIterator& it) const; + bool operator!= (const SearchIterator& it) const; + + SearchIterator& operator++(); + SearchIterator operator++(int); + SearchIterator& operator--(); + SearchIterator operator--(int); + + std::string getPath() const; + std::string getTitle() const; + int getScore() const; + std::string getSnippet() const; + int getWordCount() const; + int getSize() const; + int getFileIndex() const; + Uuid getZimId() const; + reference operator*() const; + pointer operator->() const; + +#ifdef ZIM_PRIVATE + std::string getDbData() const; +#endif + + private: + struct InternalData; + std::unique_ptr internal; + SearchIterator(InternalData* internal_data); + + bool isEnd() const; +}; + +} // namespace zim + +#endif // ZIM_SEARCH_ITERATOR_H diff --git a/include/zim/suggestion.h b/include/zim/suggestion.h new file mode 100644 index 0000000..c8020ad --- /dev/null +++ b/include/zim/suggestion.h @@ -0,0 +1,188 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software 
Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SUGGESTION_H +#define ZIM_SUGGESTION_H + +#include "suggestion_iterator.h" +#include "archive.h" + +#if defined(LIBZIM_WITH_XAPIAN) +namespace Xapian { + class Enquire; + class MSet; +}; +#endif + +namespace zim +{ + +class SuggestionSearcher; +class SuggestionSearch; +class SuggestionIterator; +class SuggestionDataBase; + +/** + * A SuggestionSearcher is an object suggesting over titles of an Archive + * + * A SuggestionSearcher is mainly used to create new `SuggestionSearch` + * Internally, this is a wrapper around a SuggestionDataBase which may or may not + * include a Xapian index. + * + * You should consider that all search operations are NOT threadsafe. + * It is up to you to protect your calls to avoid race conditions. + * However, SuggestionSearcher (and subsequent classes) do not maintain a global/ + * shared state. You can create several Searchers and use them in different threads. + */ +class SuggestionSearcher +{ + public: + /** SuggestionSearcher constructor. + * + * Construct a SuggestionSearcher on top of an archive. + * + * @param archive An archive to suggest on. 
+ */ + explicit SuggestionSearcher(const Archive& archive); + + SuggestionSearcher(const SuggestionSearcher& other); + SuggestionSearcher& operator=(const SuggestionSearcher& other); + SuggestionSearcher(SuggestionSearcher&& other); + SuggestionSearcher& operator=(SuggestionSearcher&& other); + ~SuggestionSearcher(); + + /** Create a SuggestionSearch for a specific query. + * + * The search is made on the archive under the SuggestionSearcher. + * + * @param query The query string to search for. + */ + SuggestionSearch suggest(const std::string& query); + + /** Set the verbosity of search operations. + * + * @param verbose The verbose mode to set + */ + void setVerbose(bool verbose); + + private: // methods + void initDatabase(); + + private: // data + std::shared_ptr mp_internalDb; + Archive m_archive; + bool m_verbose; +}; + +/** + * A SuggestionSearch represents a particular suggestion search, based on a `SuggestionSearcher`. + */ +class SuggestionSearch +{ + public: + SuggestionSearch(SuggestionSearch&& s); + SuggestionSearch& operator=(SuggestionSearch&& s); + ~SuggestionSearch(); + + /** Get a set of results for this search. + * + * @param start The beginning of the range to get + * (offset of the first result). + * @param maxResults The maximum number of results to return + * (offset of last result from the start of range). + */ + const SuggestionResultSet getResults(int start, int maxResults) const; + + /** Get the number of estimated results for this suggestion search. + * + * As the name suggests, it is an estimation of the number of results. 
+ */ + int getEstimatedMatches() const; + + private: // methods + SuggestionSearch(std::shared_ptr p_internalDb, const std::string& query); + + private: // data + std::shared_ptr mp_internalDb; + std::string m_query; + + friend class SuggestionSearcher; + +#ifdef ZIM_PRIVATE + public: + // Close Xapian db to force range based search + const void forceRangeSuggestion(); +#endif + +// Xapian based methods and data +#if defined(LIBZIM_WITH_XAPIAN) + private: // Xapian based methods + Xapian::Enquire& getEnquire() const; + + private: // Xapian based data + mutable std::unique_ptr mp_enquire; +#endif // LIBZIM_WITH_XAPIAN +}; + +/** + * The `SuggestionResultSet` represents a range of results corresponding to a `SuggestionSearch`. + * + * It mainly allows getting an iterator based either on an MSetIterator or a RangeIterator. + */ +class SuggestionResultSet +{ + public: + typedef SuggestionIterator iterator; + typedef Archive::EntryRange EntryRange; + + /** The begin iterator on the result range. */ + iterator begin() const; + + /** The end iterator on the result range. 
*/ + iterator end() const; + + /** The size of the SearchResult (end()-begin()) */ + int size() const; + + private: // data + std::shared_ptr mp_internalDb; + std::shared_ptr mp_entryRange; + + private: + SuggestionResultSet(EntryRange entryRange); + + friend class SuggestionSearch; + +// Xapian based methods and data +#if defined(LIBZIM_WITH_XAPIAN) + + private: // Xapian based methods + SuggestionResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset); + + private: // Xapian based data + std::shared_ptr mp_mset; + +#endif // LIBZIM_WITH_XAPIAN +}; + +} // namespace zim + +#endif // ZIM_SUGGESTION_H diff --git a/include/zim/suggestion_iterator.h b/include/zim/suggestion_iterator.h new file mode 100644 index 0000000..ec2f890 --- /dev/null +++ b/include/zim/suggestion_iterator.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SUGGESTION_ITERATOR_H +#define ZIM_SUGGESTION_ITERATOR_H + +#include "archive.h" +#include + +namespace zim +{ +class SuggestionResultSet; +class SuggestionItem; +class SearchIterator; + +class SuggestionIterator : public std::iterator +{ + typedef Archive::iterator RangeIterator; + friend class SuggestionResultSet; + public: + SuggestionIterator() = delete; + SuggestionIterator(const SuggestionIterator& it); + SuggestionIterator& operator=(const SuggestionIterator& it); + SuggestionIterator(SuggestionIterator&& it); + SuggestionIterator& operator=(SuggestionIterator&& it); + ~SuggestionIterator(); + + bool operator== (const SuggestionIterator& it) const; + bool operator!= (const SuggestionIterator& it) const; + + SuggestionIterator& operator++(); + SuggestionIterator operator++(int); + SuggestionIterator& operator--(); + SuggestionIterator operator--(int); + + Entry getEntry() const; + + const SuggestionItem& operator*(); + const SuggestionItem* operator->(); + + private: // data + struct SuggestionInternalData; + std::unique_ptr mp_rangeIterator; + std::unique_ptr m_suggestionItem; + + private: // methods + SuggestionIterator(RangeIterator rangeIterator); + +// Xapian based methods and data +#if defined(LIBZIM_WITH_XAPIAN) +#ifdef ZIM_PRIVATE + public: + std::string getDbData() const; +#endif + private: // xapian based data + std::unique_ptr mp_internal; + + private: // xapian based methods + std::string getIndexPath() const; + std::string getIndexTitle() const; + std::string getIndexSnippet() const; + SuggestionIterator(SuggestionInternalData* internal_data); +#endif // LIBZIM_WITH_XAPIAN +}; + +class SuggestionItem +{ + public: // methods + std::string getTitle() const { return title; } + std::string getPath() const { return 
path; } + std::string getSnippet() const { return snippet; } + + bool hasSnippet() const { return !snippet.empty(); } + + private: // data + std::string title; + std::string path; + std::string snippet; + + private: // methods + explicit SuggestionItem(std::string title, std::string path, std::string snippet = "") + : title(title), + path(path), + snippet(snippet) {} + + friend class SuggestionIterator; +}; + +} // namespace zim + +#endif // ZIM_SUGGESTION_ITERATOR_H diff --git a/include/zim/tools.h b/include/zim/tools.h new file mode 100644 index 0000000..80cb125 --- /dev/null +++ b/include/zim/tools.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2022 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_TOOLS_H +#define ZIM_TOOLS_H + +#include + + +namespace zim { +#if defined(ENABLE_XAPIAN) + + /** Helper function to set the icu data directory. + * + * On Android, we compile ICU without data integrated + * in the library. So android application needs to set + * the data directory where ICU can find its data. 
+ */ + void setICUDataDirectory(const std::string& path); + +#endif +} + +#endif // ZIM_TOOLS_H diff --git a/include/zim/uuid.h b/include/zim/uuid.h new file mode 100644 index 0000000..6544eca --- /dev/null +++ b/include/zim/uuid.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2021 Mannesh P M + * Copyright (C) 2018 Matthieu Gautier + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_UUID_H +#define ZIM_UUID_H + +#include +#include +#include +#include + +namespace zim +{ + struct Uuid + { + Uuid() + { + std::memset(data, 0, 16); + } + + Uuid(const char uuid[16]) + { + std::copy(uuid, uuid+16, data); + } + + static Uuid generate(std::string value = ""); + + bool operator== (const Uuid& other) const + { return std::equal(data, data+16, other.data); } + bool operator!= (const Uuid& other) const + { return !(*this == other); } + unsigned size() const { return 16; } + + explicit operator std::string() const; + + char data[16]; + }; + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid); + +} + +#endif // ZIM_UUID_H diff --git a/include/zim/version.h b/include/zim/version.h new file mode 100644 index 0000000..f94a532 --- /dev/null +++ b/include/zim/version.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2021 
Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_VERSION_H +#define ZIM_VERSION_H + +#include +#include + +namespace zim +{ + typedef std::vector> LibVersions; + LibVersions getVersions(); + void printVersions(std::ostream& out = std::cout); +} + +#endif // ZIM_VERSION_H + diff --git a/include/zim/writer/contentProvider.h b/include/zim/writer/contentProvider.h new file mode 100644 index 0000000..eadd135 --- /dev/null +++ b/include/zim/writer/contentProvider.h @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CONTENTPROVIDER_H +#define ZIM_WRITER_CONTENTPROVIDER_H + +#include +#include +#include +#include + +namespace zim +{ +#ifdef _WIN32 + #define DEFAULTFD zim::windows::FD + namespace windows { +#else + #define DEFAULTFD zim::unix::FD + namespace unix { +#endif + class FD; + } + namespace writer + { + /** + * `ContentProvider` is an abstract class in charge of providing the content to + * add in the archive to the creator. + */ + class ContentProvider { + public: + virtual ~ContentProvider() = default; + /** + * The size of the content to add into the archive. + * + * @return the total size of the content. + */ + virtual zim::size_type getSize() const = 0; + + /** + * Return a blob to add to the archive. + * + * The returned blob doesn't have to represent the whole content. + * The feed method can return the whole content chunk by chunk or in + * one step. + * When the whole content has been returned, feed must return an empty blob + * (size == 0). + * + * This method will be called several times (at least twice) for + * each content to add. + * + * It is up to the implementation to manage correctly the data pointed by + * the returned blob. + * It may (re)use the same buffer between calls (rewriting its content), + * create a new buffer each time or make the blob point to a new region of + * a big buffer. + * It is up to the implementation to free any allocated memory. + * + * The data pointed by the blob must stay valid until the next call to feed. + * A call to feed ensure that the data returned by a previous call will not + * be used anymore. + */ + virtual Blob feed() = 0; + }; + + /** + * StringProvider provide the content stored in a string. 
+ */ + class StringProvider : public ContentProvider { + public: + /** + * Create a provider using a string as content. + * The string content is copied and the reference don't have to be "keep" alive. + * + * @param content the content to serve. + */ + explicit StringProvider(const std::string& content) + : content(content), + feeded(false) + {} + zim::size_type getSize() const { return content.size(); } + Blob feed(); + + protected: + std::string content; + bool feeded; + }; + + /** + * SharedStringProvider provide the content stored in a shared string. + * + * It is mostly the same thing that `StringProvider` but use a shared_ptr + * to avoid copy. + */ + class SharedStringProvider : public ContentProvider { + public: + /** + * Create a provider using a string as content. + * The string content is not copied. + * + * @param content the content to serve. + */ + explicit SharedStringProvider(std::shared_ptr content) + : content(content), + feeded(false) + {} + zim::size_type getSize() const { return content->size(); } + Blob feed(); + + protected: + std::shared_ptr content; + bool feeded; + + }; + + /** + * FileProvider provide the content stored in file. + */ + class FileProvider : public ContentProvider { + public: + /** + * Create a provider using file as content. + * + * @param filepath the path to the file to serve. 
+ */ + explicit FileProvider(const std::string& filepath); + ~FileProvider(); + zim::size_type getSize() const { return size; } + Blob feed(); + + protected: + std::string filepath; + zim::size_type size; + + private: + std::unique_ptr buffer; + std::unique_ptr fd; + zim::offset_type offset; + }; + + } +} + +#undef DEFAULTFD + +#endif // ZIM_WRITER_CONTENTPROVIDER_H diff --git a/include/zim/writer/creator.h b/include/zim/writer/creator.h new file mode 100644 index 0000000..00f6414 --- /dev/null +++ b/include/zim/writer/creator.h @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CREATOR_H +#define ZIM_WRITER_CREATOR_H + +#include +#include +#include + +namespace zim +{ + class Fileheader; + namespace writer + { + class CreatorData; + + /** + * The `Creator` is responsible to create a zim file. + * + * Once the `Creator` is instantiated, it can be configured with the + * `config*` methods. + * Then the creation process must be started with `startZimCreation`. + * Elements of the zim file can be added using the `add*` methods. + * The final steps is to call `finishZimCreation`. 
+ * + * During the creation of the zim file (and before the call to `finishZimCreation`), + * some values must be set using the `set*` methods. + * + * All `add*` methods can throw a std::runtime_error exception if the entry + * cannot be added (mainly because a entry with the same path has already been added). + * It is up to the user to catch this exception and handle the error. + * The creator is still in a valid state and the creation can continue. + */ + class Creator + { + public: + /** + * Creator constructor. + * + * Takes no argument: the verbosity, the compression algorithm, etc. + * are set afterwards with the `config*` methods. + */ + Creator(); + virtual ~Creator(); + + /** + * Configure the verbosity of the creator + * + * @param verbose if the creator prints verbose information. + * @return a reference to itself. + */ + Creator& configVerbose(bool verbose); + + /** + * Configure the compression algorithm to use. + * + * @param compression the compression algorithm to use. + * @return a reference to itself. + */ + Creator& configCompression(Compression compression); + + /** + * Set the size of the created clusters. + * + * The creator will try to create clusters with (uncompressed) size + * as close as possible to targetSize without exceeding that limit. + * If not possible, the only such case being an item larger than targetSize, + * a separated cluster will be allocated for that oversized item. + * + * Be careful with this value. + * Bigger value means more content put together, so a better compression ratio. + * But it means also that more decompression has to be made when reading a blob. + * If you don't know which value to put, don't use this method and let libzim + * use the default value. + * + * @param targetSize The target size of a cluster (in bytes). + * @return a reference to itself. + */ + Creator& configClusterSize(zim::size_type targetSize); + + /** + * Configure the fulltext indexing feature. 
+ * + * @param indexing True if we must fulltext index the content. + * @param language Language to use for the indexation. + * @return a reference to itself. + */ + Creator& configIndexing(bool indexing, const std::string& language); + + /** + * Set the number of thread to use for the internal worker. + * + * @param nbWorkers The number of workers to use. + * @return a reference to itself. + */ + Creator& configNbWorkers(unsigned nbWorkers); + + /** + * Start the zim creation. + * + * The creator must have been configured before calling this method. + * + * @param filepath the path of the zim file to create. + */ + void startZimCreation(const std::string& filepath); + + /** + * Add a item to the archive. + * + * @param item The item to add. + */ + void addItem(std::shared_ptr item); + + /** + * Add a metadata to the archive. + * + * @param name the name of the metadata + * @param content the content of the metadata + * @param mimetype the mimetype of the metadata. + * Only used to detect if the metadata must be compressed or not. + */ + void addMetadata(const std::string& name, const std::string& content, const std::string& mimetype = "text/plain;charset=utf-8"); + + /** + * Add a metadata to the archive using a contentProvider instead of plain string. + * + * @param name the name of the metadata. + * @param provider the provider of the content of the metadata. + * @param mimetype the mimetype of the metadata. + * Only used to detect if the metadata must be compressed. + */ + void addMetadata(const std::string& name, std::unique_ptr provider, const std::string& mimetype = "text/plain;charset=utf-8"); + + /** + * Add illustration to the archive. + * + * @param size the size (width and height) of the illustration. + * @param content the content of the illustration (must be a png content) + */ + void addIllustration(unsigned int size, const std::string& content); + + /** + * Add illustration to the archive. 
+ * + * @param size the size (width and height) of the illustration. + * @param provider the provider of the content of the illustration (must be a png content) + */ + void addIllustration(unsigned int size, std::unique_ptr provider); + + /** + * Add a redirection to the archive. + * + * Hints (especially FRONT_ARTICLE) can be used to put the redirection + * in the front articles list. + * By default, redirections are not front articles. + * + * @param path the path of the redirection. + * @param title the title of the redirection. + * @param targetpath the path of the target of the redirection. + * @param hints hints associated to the redirection. + */ + void addRedirection( + const std::string& path, + const std::string& title, + const std::string& targetpath, + const Hints& hints = Hints()); + + /** + * Finalize the zim creation. + */ + void finishZimCreation(); + + /** + * Set the path of the main page. + * + * @param mainPath The path of the main page. + */ + void setMainPath(const std::string& mainPath) { m_mainPath = mainPath; } + + /** + * Set the uuid of the archive. + * + * @param uuid The uuid of the archive. 
+ */ + void setUuid(const zim::Uuid& uuid) { m_uuid = uuid; } + + private: + std::unique_ptr data; + + // configuration + bool m_verbose = false; + Compression m_compression = Compression::Zstd; + bool m_withIndex = false; + size_t m_clusterSize; + std::string m_indexingLanguage; + unsigned m_nbWorkers = 4; + + // zim data + std::string m_mainPath; + Uuid m_uuid = Uuid::generate(); + + void fillHeader(Fileheader* header) const; + void writeLastParts() const; + }; + } + +} + +#endif // ZIM_WRITER_CREATOR_H diff --git a/include/zim/writer/item.h b/include/zim/writer/item.h new file mode 100644 index 0000000..3530d05 --- /dev/null +++ b/include/zim/writer/item.h @@ -0,0 +1,262 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_ITEM_H +#define ZIM_WRITER_ITEM_H + +#include +#include +#include +#include +#include + +#include + +namespace zim +{ + namespace writer + { + enum HintKeys { + COMPRESS, + FRONT_ARTICLE, + }; + using Hints = std::map; + + class ContentProvider; + class IndexData { + public: + using GeoPosition = std::tuple; + virtual ~IndexData() = default; + virtual bool hasIndexData() const = 0; + virtual std::string getTitle() const = 0; + virtual std::string getContent() const = 0; + virtual std::string getKeywords() const = 0; + virtual uint32_t getWordCount() const = 0; + virtual GeoPosition getGeoPosition() const = 0; + }; + + /** + * Item represent data to be added to the archive. + * + * This is a abstract class the user need to implement. + * libzim provides `BasicItem`, `StringItem` and `FileItem` + * to simplify (or avoid) this reimplementation. + */ + class Item + { + public: + /** + * The path of the item. + * + * The path must be absolute. + * Path must be unique. + * + * @return the path of the item. + */ + virtual std::string getPath() const = 0; + + /** + * The title of the item. + * + * Item's title is indexed and is used for the suggestion system. + * Title don't have to be unique. + * + * @return the title of the item. + */ + virtual std::string getTitle() const = 0; + + /** + * The mimetype of the item. + * + * Mimetype is store within the content. + * It is also used to detect if the content must be compressed or not. + * + * @return the mimetype of the item. + */ + virtual std::string getMimeType() const = 0; + + /** + * The content provider of the item. + * + * The content provider is responsible to provide the content to the creator. 
+ * The returned content provider must stay valid even after creator release + * its reference to the item. + * + * This method will be called once by libzim, in the main thread + * (but will be used in a different thread). + * The default IndexData will also call this method once (more) + * in the main thread (and use it in another thread). + * + * @return the contentProvider of the item. + */ + virtual std::unique_ptr getContentProvider() const = 0; + + /** + * The index data of the item. + * + * The index data is the data to index. (May be different from the content + * to store). + * The returned index data must stay valid even after creator release + * its reference to the item. + * This method will be called once by libzim if it is compiled with xapian + * (and is configured to index data). + * + * The returned IndexData will be used as source to index the item. + * If you don't want the item to be indexed, you can return a nullptr here + * or return a valid IndexData pointer which will return false to `hasIndexData`. + * + * If you don't implement this method, a default implementation will be used. + * The default implementation first checks for the mimetype and if the mimetype + * contains `text/html` it will use a contentProvider to get the content to index. + * The contentProvider will be created in the main thread but the data reading and + * parsing will occur in a different thread. + * + * All methods of `IndexData` will be called in a different (same) thread. + * + * @return the indexData of the item. + * May return a nullptr if there is no indexData. + */ + virtual std::shared_ptr getIndexData() const; + + /** + * Hints to help the creator takes decision about the item. + * + * For now two hints are supported: + * - COMPRESS: Can be used to force the creator to put the item content + * in a compressed cluster (if true) or not (if false). + * If the hint is not provided, the decision is taken based on the + * mimetype (textual or binary content ?) 
+ * - FRONT_ARTICLE: Can (Should) be used to specify if the item is + * a front article or not. + * If the hint is not provided, the decision is taken based on the + * mimetype (html or not ?) + * + * @return A list of hints. + */ + virtual Hints getHints() const; + + /** + * Returns the getHints() amended with default values based on mimetypes. + */ + Hints getAmendedHints() const; + virtual ~Item() = default; + }; + + /** + * A BasicItem is a partial implementation of a Item. + * + * `BasicItem` provides a basic implementation for everything about an `Item` + * but the actual content of the item. + */ + class BasicItem : public Item + { + public: + /** + * Create a BasicItem with the given path, mimetype and title. + * + * @param path the path of the item. + * @param mimetype the mimetype of the item. + * @param title the title of the item. + */ + BasicItem(const std::string& path, const std::string& mimetype, const std::string& title, Hints hints) + : path(path), + mimetype(mimetype), + title(title), + hints(hints) + {} + + std::string getPath() const { return path; } + std::string getTitle() const { return title; } + std::string getMimeType() const { return mimetype; } + Hints getHints() const { return hints; } + + protected: + std::string path; + std::string mimetype; + std::string title; + Hints hints; + }; + + /** + * A `StringItem` is a full implemented item where the content is stored in a string. + */ + class StringItem : public BasicItem, public std::enable_shared_from_this + { + public: + /** + * Create a StringItem with the given path, mimetype, title and content. + * + * The parameters are the ones of the private constructor. + * + * @param path the path of the item. + * @param mimetype the mimetype of the item. + * @param title the title of the item. + * @param content the content of the item. + */ + template + static std::shared_ptr create(Ts&&... 
params) { + return std::shared_ptr(new StringItem(std::forward(params)...)); + } + + std::unique_ptr getContentProvider() const; + + protected: + std::string content; + + private: + StringItem(const std::string& path, const std::string& mimetype, + const std::string& title, Hints hints, const std::string& content) + : BasicItem(path, mimetype, title, hints), + content(content) + {} + + + + }; + + /** + * A `FileItem` is a fully implemented item whose content is a file. + */ + class FileItem : public BasicItem + { + public: + /** + * Create a FileItem with the given path, mimetype, title and filepath. + * + * @param path the path of the item. + * @param mimetype the mimetype of the item. + * @param title the title of the item. + * @param filepath the path of the file in the filesystem. + */ + FileItem(const std::string& path, const std::string& mimetype, + const std::string& title, Hints hints, const std::string& filepath) + : BasicItem(path, mimetype, title, hints), + filepath(filepath) + {} + + std::unique_ptr getContentProvider() const; + + protected: + std::string filepath; + }; + + } +} + +#endif // ZIM_WRITER_ITEM_H diff --git a/include/zim/zim.h b/include/zim/zim.h new file mode 100644 index 0000000..7cd8984 --- /dev/null +++ b/include/zim/zim.h @@ -0,0 +1,117 @@ +/* + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2018-2020 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ZIM_H +#define ZIM_ZIM_H + +#include + +#ifdef __GNUC__ +#define DEPRECATED __attribute__((deprecated)) +#elif defined(_MSC_VER) +#define DEPRECATED __declspec(deprecated) +#else +#pragma message("WARNING: You need to implement DEPRECATED for this compiler") +#define DEPRECATED +#endif + + +#include + +namespace zim +{ + // An index of an entry (in a zim file) + typedef uint32_t entry_index_type; + + // An index of a cluster (in a zim file) + typedef uint32_t cluster_index_type; + + // An index of a blob (in a cluster) + typedef uint32_t blob_index_type; + + // The size of something (entry, zim, cluster, blob, ...) + typedef uint64_t size_type; + + // An offset. + typedef uint64_t offset_type; + + enum class Compression + { + None = 1, + + // intermediate values correspond to compression + // methods that are no longer supported + + Zstd = 5 + }; + + static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate"; + + /** + * Various types of integrity checks performed by `zim::validate()`. + */ + enum class IntegrityCheck + { + /** + * Validates the checksum of the ZIM file. + */ + CHECKSUM, + + /** + * Checks that offsets in UrlPtrList are valid. + */ + DIRENT_PTRS, + + /** + * Checks that dirents are properly sorted. + */ + DIRENT_ORDER, + + /** + * Checks that entries in the title index are valid and properly sorted. + */ + TITLE_INDEX, + + /** + * Checks that offsets in ClusterPtrList are valid. + */ + CLUSTER_PTRS, + + /** + * Checks that mime-type values in dirents are valid. + */ + DIRENT_MIMETYPES, + + //////////////////////////////////////////////////////////////////////////// + // End of integrity check types. 
+ // COUNT must be the last one and denotes the count of all checks + //////////////////////////////////////////////////////////////////////////// + + /** + * `COUNT` is not a valid integrity check type. It exists to tell the + * number of all supported integrity checks. + */ + COUNT + }; +} + +#endif // ZIM_ZIM_H + diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..8e2c1bc --- /dev/null +++ b/meson.build @@ -0,0 +1,101 @@ +project('libzim', ['c', 'cpp'], + version : '8.0.0', + license : 'GPL2', + default_options : ['c_std=c11', 'cpp_std=c++11']) + +if build_machine.system() != 'windows' + add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp') +endif + +cpp = meson.get_compiler('cpp') +sizeof_off_t = cpp.sizeof('off_t') +sizeof_size_t = cpp.sizeof('size_t') + +private_conf = configuration_data() +public_conf = configuration_data() + +private_conf.set('VERSION', '"@0@"'.format(meson.project_version())) +public_conf.set('LIBZIM_VERSION', '"@0@"'.format(meson.project_version())) +private_conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE')) +private_conf.set('DIRENT_LOOKUP_CACHE_SIZE', get_option('DIRENT_LOOKUP_CACHE_SIZE')) +private_conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE')) +private_conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE')) +private_conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8) +private_conf.set10('ENV64BIT', sizeof_size_t==8) +private_conf.set10('ENV32BIT', sizeof_size_t==4) +if host_machine.system() == 'windows' + private_conf.set('ENABLE_USE_MMAP', false) + add_project_arguments('-DNOMINMAX', language: 'cpp') +else + private_conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP')) +endif +private_conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER')) + +static_linkage = get_option('static-linkage') +static_linkage = static_linkage or get_option('default_library')=='static' + +lzma_dep = dependency('liblzma', static:static_linkage) +if 
static_linkage + add_project_arguments('-DLZMA_API_STATIC', language: 'cpp') +endif + +zstd_dep = dependency('libzstd', static:static_linkage) + +if host_machine.system() == 'freebsd' + execinfo_dep = cpp.find_library('execinfo') +endif + +if get_option('with_xapian') + xapian_dep = dependency('xapian-core', static:static_linkage) +else + xapian_dep = dependency('', required:false) +endif +private_conf.set('ENABLE_XAPIAN', xapian_dep.found()) +public_conf.set('LIBZIM_WITH_XAPIAN', xapian_dep.found()) + +pkg_requires = ['liblzma', 'libzstd'] +if build_machine.system() == 'windows' + extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin'] + extra_cpp_args = ['-DSORTPP_PASS'] +else + extra_link_args = [] + extra_cpp_args = [] +endif + +compiler = meson.get_compiler('cpp') +if (compiler.get_id() == 'gcc' and build_machine.system() == 'linux') or host_machine.system() == 'freebsd' + # C++ std::thread is implemented using pthread on linux by gcc + thread_dep = dependency('threads') +else + thread_dep = dependency('', required:false) +endif + +if xapian_dep.found() + pkg_requires += ['xapian-core'] + icu_dep = dependency('icu-i18n', static:static_linkage) + pkg_requires += ['icu-i18n'] +else + icu_dep = dependency('icu-i18n', required:false, static:static_linkage) +endif + +gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false) + +inc = include_directories('include') + +subdir('include') +subdir('scripts') +subdir('static') +subdir('src') +subdir('examples') +subdir('test') +if get_option('doc') + subdir('docs') +endif + +pkg_mod = import('pkgconfig') +pkg_mod.generate(libraries : libzim, + version : meson.project_version(), + name : 'libzim', + filebase : 'libzim', + description : 'A Library to read/write ZIM files.', + requires : pkg_requires) diff --git a/meson_options.txt b/meson_options.txt new file mode 100644 index 0000000..84242ed --- /dev/null +++ b/meson_options.txt @@ -0,0 +1,22 @@ 
+option('CLUSTER_CACHE_SIZE', type : 'string', value : '16', + description : 'set cluster cache size to number (default:16)') +option('DIRENT_CACHE_SIZE', type : 'string', value : '512', + description : 'set dirent cache size to number (default:512)') +option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024', + description : 'set dirent lookup cache size to number (default:1024)') +option('LZMA_MEMORY_SIZE', type : 'string', value : '128', + description : 'set lzma uncompress memory in MB (default:128)') +option('USE_MMAP', type: 'boolean', value: true, + description: 'Use mmap to avoid copy from file. (default:true, always false on windows)') +option('USE_BUFFER_HEADER', type: 'boolean', value: true, + description: '''Copy (or use mmap) header index buffers. (default:true) +Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory. +If false, we directly read the index in the file at each article access.''') +option('static-linkage', type : 'boolean', value : false, + description : 'Link statically with the dependencies.') +option('doc', type : 'boolean', value : false, + description : 'Build the documentations.') +option('with_xapian', type : 'boolean', value: true, + description: 'Build libzim with xapian support') +option('test_data_dir', type : 'string', value: '', + description: 'Where the test data are. If not set, meson will use a internal directory in the build dir. If you want to download the data in the specified directory you can use `meson download_test_data`. 
As a special value, you can pass `none` to deactivate test using external test data.') diff --git a/scripts/download_test_data.py b/scripts/download_test_data.py new file mode 100755 index 0000000..d320fb3 --- /dev/null +++ b/scripts/download_test_data.py @@ -0,0 +1,64 @@ +#!/usr/bin/env python3 + +''' +Copyright 2021 Matthieu Gautier + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or any +later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. 
+''' + +import argparse +from pathlib import Path +from urllib import request +from urllib.error import * +import tarfile +import sys + +TEST_DATA_VERSION = "0.3" +ARCHIVE_URL_TEMPL = "https://github.com/openzim/zim-testing-suite/releases/download/v{version}/zim-testing-suite-{version}.tar.gz" + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--version', '-v', + help="The version to download.", + default=TEST_DATA_VERSION) + parser.add_argument('--remove-top-dir', + help="Remove the top directory when extracting", + action='store_true') + parser.add_argument('outdir', + help='The directory where to install the test data.') + args = parser.parse_args() + + test_data_url = ARCHIVE_URL_TEMPL.format(version=args.version) + + try: + with request.urlopen(test_data_url) as f: + with tarfile.open(fileobj=f, mode="r|*") as archive: + while True: + member = archive.next() + if member is None: + break + if args.remove_top_dir: + member.name = '/'.join(member.name.split('/')[1:]) + archive.extract(member, path=args.outdir) + + except HTTPError as e: + print("Error downloading archive at url : {}".format(test_data_url)) + print(e) + sys.exit(1) + except OSError as e: + print("Error writing the test data on the file system.") + print(e) + sys.exit(1) diff --git a/scripts/libzim-compile-resources b/scripts/libzim-compile-resources new file mode 100755 index 0000000..e4993ba --- /dev/null +++ b/scripts/libzim-compile-resources @@ -0,0 +1,201 @@ +#!/usr/bin/env python3 + +''' +Copyright 2016 Matthieu Gautier + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 3 of the License, or any +later version. + +This program is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA +02110-1301, USA. +''' + +import argparse +import os.path +import re + +def full_identifier(filename): + parts = os.path.normpath(filename).split(os.sep) + parts = [to_identifier(part) for part in parts] + print(filename, parts) + return parts + +def to_identifier(name): + ident = re.sub(r'[^0-9a-zA-Z]', '_', name) + if ident[0].isnumeric(): + return "_"+ident + return ident + +resource_impl_template = """ +static const unsigned char {data_identifier}[] = {{ + {resource_content} +}}; + +namespace RESOURCE {{ +{namespaces_open} +const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len}); +{namespaces_close} +}} +""" + +resource_getter_template = """ + if (name == "{common_name}") + return RESOURCE::{identifier}; +""" + +resource_decl_template = """{namespaces_open} +extern const std::string {identifier}; +{namespaces_close}""" + +class Resource: + def __init__(self, base_dirs, filename): + filename = filename.strip() + self.filename = filename + self.identifier = full_identifier(filename) + found = False + for base_dir in base_dirs: + try: + with open(os.path.join(base_dir, filename), 'rb') as f: + self.data = f.read() + found = True + break + except FileNotFoundError: + continue + if not found: + raise Exception("Impossible to found {}".format(filename)) + + def dump_impl(self): + nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0) + sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row)) + + return resource_impl_template.format( + data_identifier="_".join([""]+self.identifier), + resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced), + resource_len=len(self.data), + namespaces_open=" ".join("namespace {} {{".format(id) for id in 
self.identifier[:-1]), + namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1], + env_identifier="RES_"+"_".join(self.identifier)+"_PATH" + ) + + def dump_getter(self): + return resource_getter_template.format( + common_name=self.filename, + identifier="::".join(self.identifier) + ) + + def dump_decl(self): + return resource_decl_template.format( + namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]), + namespaces_close=" ".join(["}"]*(len(self.identifier)-1)), + identifier=self.identifier[-1] + ) + + + +master_c_template = """//This file is automaically generated. Do not modify it. + +#include +#include +#include "{include_file}" + +static std::string init_resource(const char* name, const unsigned char* content, int len) +{{ + char * resPath = getenv(name); + if (NULL == resPath) + return std::string(reinterpret_cast(content), len); + + std::ifstream ifs(resPath); + if (!ifs.good()) + return std::string(reinterpret_cast(content), len); + return std::string( (std::istreambuf_iterator(ifs)), + (std::istreambuf_iterator() )); +}} + +const std::string& getResource_{basename}(const std::string& name) {{ +{RESOURCES_GETTER} + throw ResourceNotFound("Resource not found."); +}} + +{RESOURCES} + +""" + +def gen_c_file(resources, basename): + return master_c_template.format( + RESOURCES="\n\n".join(r.dump_impl() for r in resources), + RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources), + include_file=basename, + basename=to_identifier(basename) + ) + + + +master_h_template = """//This file is automaically generated. Do not modify it. 
+#ifndef KIWIX_{BASENAME} +#define KIWIX_{BASENAME} + +#include +#include + +namespace RESOURCE {{ + {RESOURCES} +}}; + +class ResourceNotFound : public std::runtime_error {{ + public: + ResourceNotFound(const std::string& what_arg): + std::runtime_error(what_arg) + {{ }}; +}}; + +const std::string& getResource_{basename}(const std::string& name); + +#define getResource(a) (getResource_{basename}(a)) + +#endif // KIWIX_{BASENAME} + +""" + +def gen_h_file(resources, basename): + return master_h_template.format( + RESOURCES="\n ".join(r.dump_decl() for r in resources), + BASENAME=basename.upper(), + basename=basename, + ) + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--cxxfile', + help='The Cpp file name to generate') + parser.add_argument('--hfile', + help='The h file name to generate') + parser.add_argument('--source_dir', + help="Additional directory where to look for resources.", + action='append') + parser.add_argument('resource_file', + help='The list of resources to compile.') + args = parser.parse_args() + + base_dir = os.path.dirname(os.path.realpath(args.resource_file)) + source_dir = args.source_dir or [] + with open(args.resource_file, 'r') as f: + resources = [Resource([base_dir]+source_dir, filename) + for filename in f.readlines()] + + h_identifier = to_identifier(os.path.basename(args.hfile)) + with open(args.hfile, 'w') as f: + f.write(gen_h_file(resources, h_identifier)) + + with open(args.cxxfile, 'w') as f: + f.write(gen_c_file(resources, os.path.basename(args.hfile))) + diff --git a/scripts/meson.build b/scripts/meson.build new file mode 100644 index 0000000..7e215a9 --- /dev/null +++ b/scripts/meson.build @@ -0,0 +1,3 @@ + +res_compiler = find_program('libzim-compile-resources') +test_data_downloader = find_program('download_test_data.py') diff --git a/src/_dirent.h b/src/_dirent.h new file mode 100644 index 0000000..907e9e1 --- /dev/null +++ b/src/_dirent.h @@ -0,0 +1,128 @@ +/* + * Copyright (C) 
2018-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yankan + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENT_H +#define ZIM_DIRENT_H + +#include +#include +#include +#include + +#include "zim_types.h" +#include "debug.h" + +namespace zim +{ + class Buffer; + class InvalidSize : public std::exception {}; + class Dirent + { + protected: + uint16_t mimeType; + + uint32_t version; + + cluster_index_t clusterNumber; // only used when redirect is false + blob_index_t blobNumber; // only used when redirect is false + + entry_index_t redirectIndex; // only used when redirect is true + + char ns; + std::string title; + std::string url; + std::string parameter; + + public: + // these constants are put into mimeType field + static const uint16_t redirectMimeType = 0xffff; + static const uint16_t linktargetMimeType = 0xfffe; + static const uint16_t deletedMimeType = 0xfffd; + + Dirent() + : mimeType(0), + version(0), + clusterNumber(0), + blobNumber(0), + redirectIndex(0), + ns('\0') + {} + + bool isRedirect() const { return mimeType == redirectMimeType; } + bool isLinktarget() const { return mimeType == linktargetMimeType; } + bool isDeleted() const { return mimeType == deletedMimeType; } + bool 
isArticle() const { return !isRedirect() && !isLinktarget() && !isDeleted(); } + uint16_t getMimeType() const { return mimeType; } + + uint32_t getVersion() const { return version; } + void setVersion(uint32_t v) { version = v; } + + cluster_index_t getClusterNumber() const { return isRedirect() ? cluster_index_t(0) : clusterNumber; } + blob_index_t getBlobNumber() const { return isRedirect() ? blob_index_t(0) : blobNumber; } + + entry_index_t getRedirectIndex() const { return isRedirect() ? redirectIndex : entry_index_t(0); } + + char getNamespace() const { return ns; } + const std::string& getTitle() const { return title.empty() ? url : title; } + const std::string& getUrl() const { return url; } + std::string getLongUrl() const; + const std::string& getParameter() const { return parameter; } + + size_t getDirentSize() const + { + size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2; + if (title != url) + ret += title.size(); + return ret; + } + + void setTitle(const std::string& title_) + { + title = title_; + } + + void setUrl(char ns_, const std::string& url_) + { + ns = ns_; + url = url_; + } + + void setParameter(const std::string& parameter_) + { + parameter = parameter_; + } + + void setRedirect(entry_index_t idx) + { + redirectIndex = idx; + mimeType = redirectMimeType; + } + + void setItem(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_) + { + mimeType = mimeType_; + clusterNumber = clusterNumber_; + blobNumber = blobNumber_; + } + }; +} + +#endif // ZIM_DIRENT_H diff --git a/src/archive.cpp b/src/archive.cpp new file mode 100644 index 0000000..1c9e32c --- /dev/null +++ b/src/archive.cpp @@ -0,0 +1,528 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software 
Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#define ZIM_PRIVATE +#include +#include +#include +#include +#include "fileimpl.h" +#include "tools.h" +#include "log.h" + +log_define("zim.archive") + +namespace zim +{ + Archive::Archive(const std::string& fname) + : m_impl(new FileImpl(fname)) + { } + +#ifndef _WIN32 + Archive::Archive(int fd) + : m_impl(new FileImpl(fd)) + { } + + Archive::Archive(int fd, offset_type offset, size_type size) + : m_impl(new FileImpl(fd, offset_t(offset), zsize_t(size))) + { } +#endif + + const std::string& Archive::getFilename() const + { + return m_impl->getFilename(); + } + + size_type Archive::getFilesize() const + { + return m_impl->getFilesize().v; + } + + entry_index_type Archive::getAllEntryCount() const + { + return m_impl->getCountArticles().v; + } + + entry_index_type Archive::getEntryCount() const + { + return m_impl->getUserEntryCount().v; + } + + entry_index_type Archive::getArticleCount() const + { + if (m_impl->hasFrontArticlesIndex()) { + return m_impl->getFrontEntryCount().v; + } else if (m_impl->hasNewNamespaceScheme()) { + return m_impl->getNamespaceEntryCount('C').v; + } else { + return m_impl->getNamespaceEntryCount('A').v; + } + } + + Uuid Archive::getUuid() const + { + return m_impl->getFileheader().getUuid(); + } + + Item Archive::getMetadataItem(const std::string& name) const + { + auto r = m_impl->findx('M', name); + if (!r.first) { + throw 
EntryNotFound("Cannot find metadata"); + } + auto entry = Entry(m_impl, entry_index_type(r.second)); + return entry.getItem(true); + } + + std::string Archive::getMetadata(const std::string& name) const + { + auto item = getMetadataItem(name); + return item.getData(); + } + + std::vector Archive::getMetadataKeys() const { + std::vector ret; + auto start = m_impl->getNamespaceBeginOffset('M'); + auto end = m_impl->getNamespaceEndOffset('M'); + for (auto idx=start; idx!=end; idx++) { + auto dirent = m_impl->getDirent(idx); + ret.push_back(dirent->getUrl()); + } + return ret; + } + + zim::FileImpl::FindxResult findFavicon(FileImpl& impl) + { + for(auto ns:{'-', 'I'}) { + for (auto& path:{"favicon", "favicon.png"}) { + auto r = impl.findx(ns, path); + if (r.first) { + return r; + } + } + } + throw EntryNotFound("No favicon found."); + } + + Item Archive::getIllustrationItem(unsigned int size) const { + std::ostringstream ss; + ss << "Illustration_" << size << "x" << size << "@" << 1; + auto r = m_impl->findx('M', ss.str()); + if (r.first) { + return getEntryByPath(entry_index_type(r.second)).getItem(); + } + // We haven't found the exact entry. Let's "search" for a illustration and + // use the first one we found. +#if 0 + // We have decided to not implement fallback in case of wrong resolution for now. + // We keep this code for reference. + r = m_impl->findx('M', "Illustration"); + auto entry = getEntryByPath(entry_index_type(r.second)); + if (entry.getPath().find("Illustration") == 0) { + return entry.getItem(); + } +#endif + // For 48x48 illustration, return favicon for older zims. 
+ if (size == 48) { + auto r = findFavicon(*m_impl); + return getEntryByPath(entry_index_type(r.second)).getItem(true); + } + throw EntryNotFound("Cannot find illustration item."); + } + + std::set Archive::getIllustrationSizes() const { + std::set ret; + for(auto r = m_impl->findx('M', "Illustration_").second; + /*No exit test*/; + r++ + ) { + try { + auto path = getEntryByPath(entry_index_type(r)).getPath(); + if (path.find("Illustration_") != 0) { + break; + } + try { + ret.insert(parseIllustrationPathToSize(path)); + } catch (...) {} + } catch (const std::out_of_range& e) { + break; + } + } + if (ret.find(48) == ret.end()) { + try { + // raise a exception if we cannot find the (old format) favicon. + findFavicon(*m_impl); + ret.insert(48); + } catch(EntryNotFound&) {} + } + return ret; + } + + bool Archive::hasIllustration(unsigned int size) const { + try { + getIllustrationItem(size); + return true; + } catch (EntryNotFound& e) { + return false; + } + } + + Entry Archive::getEntryByPath(entry_index_type idx) const + { + if (idx >= entry_index_type(m_impl->getCountArticles())) + throw std::out_of_range("entry index out of range"); + return Entry(m_impl, idx); + } + + Entry Archive::getEntryByPath(const std::string& path) const + { + if (m_impl->hasNewNamespaceScheme()) { + // Get path in user content. + auto r = m_impl->findx('C', path); + if (r.first) { + return Entry(m_impl, entry_index_type(r.second)); + } + try { + // Path may come from a already stored from a old zim archive (bookmark), + // and so contains a namespace. + // We have to adapt the path to use the C namespace. + r = m_impl->findx('C', std::get<1>(parseLongPath(path))); + if (r.first) { + return Entry(m_impl, entry_index_type(r.second)); + } + } catch (std::runtime_error&) {} + } else { + // Path should contains the namespace. + auto r = m_impl->findx(path); + if (r.first) { + return Entry(m_impl, entry_index_type(r.second)); + } + // If not (bookmark) from a recent zim archive. 
+ for (auto ns:{'A', 'I', 'J', '-'}) { + r = m_impl->findx(ns, path); + if (r.first) { + return Entry(m_impl, entry_index_type(r.second)); + } + } + } + + throw EntryNotFound("Cannot find entry"); + } + + Entry Archive::getEntryByTitle(entry_index_type idx) const + { + return Entry(m_impl, entry_index_type(m_impl->getIndexByTitle(title_index_t(idx)))); + } + + Entry Archive::getEntryByTitle(const std::string& title) const + { + for (auto ns:{'C', 'A', 'I', 'J', '-'}) { + log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')'); + auto r = m_impl->findxByTitle(ns, title); + if (r.first) + return getEntryByTitle(entry_index_type(r.second)); + } + throw EntryNotFound("Cannot find entry"); + } + + Entry Archive::getEntryByClusterOrder(entry_index_type idx) const + { + return Entry(m_impl, entry_index_type(m_impl->getIndexByClusterOrder(entry_index_t(idx)))); + } + + Entry Archive::getMainEntry() const { + auto r = m_impl->findx('W', "mainPage"); + if (r.first) { + return getEntryByPath(entry_index_type(r.second)); + } + auto& header = m_impl->getFileheader(); + if (!header.hasMainPage()) { + throw EntryNotFound("No main page"); + } + return getEntryByPath(header.getMainPage()); + } + + bool Archive::hasMainEntry() const { + return m_impl->getFileheader().hasMainPage(); + } + + Entry Archive::getRandomEntry() const { + if ( !m_impl->hasNewNamespaceScheme() ) { + const auto startOfNamespaceA = m_impl->getNamespaceBeginOffset('A'); + const auto endOfNamespaceA = m_impl->getNamespaceEndOffset('A'); + const auto n = (endOfNamespaceA - startOfNamespaceA).v; + if ( n == 0 ) { + throw EntryNotFound("Cannot find valid random entry (empty namespace 'A'"); + } + return getEntryByPath(startOfNamespaceA.v + randomNumber(n-1)); + } else { + auto frontEntryCount = m_impl->getFrontEntryCount().v; + if (frontEntryCount == 0) { + throw EntryNotFound("Cannot find valid random entry (no front entry at all)"); + } + + return getEntryByTitle(randomNumber(frontEntryCount-1)); 
+ } + } + + bool Archive::hasFulltextIndex() const { + auto r = m_impl->findx('X', "fulltext/xapian"); + if (!r.first) { + r = m_impl->findx('Z', "/fulltextIndex/xapian"); + } + if (!r.first) { + return false; + } + auto entry = Entry(m_impl, entry_index_type(r.second)); + auto item = entry.getItem(true); + auto accessInfo = item.getDirectAccessInformation(); + return accessInfo.second; + } + + bool Archive::hasTitleIndex() const { + auto r = m_impl->findx('X', "title/xapian"); + if (!r.first) { + return false; + } + auto entry = Entry(m_impl, entry_index_type(r.second)); + auto item = entry.getItem(true); + auto accessInfo = item.getDirectAccessInformation(); + return accessInfo.second; + } + + Archive::EntryRange Archive::iterByPath() const + { + return EntryRange(m_impl, m_impl->getStartUserEntry().v, m_impl->getEndUserEntry().v); + } + + Archive::EntryRange Archive::iterByTitle() const + { + if (m_impl->hasFrontArticlesIndex()) { + // We have a front articles index. We can "simply" loop over all front entries. + return EntryRange( + m_impl, + 0, + m_impl->getFrontEntryCount().v + ); + } else if (!m_impl->hasNewNamespaceScheme()) { + // We are a old zim archive with namespace, we have to iterate on 'A' namespace. + return EntryRange( + m_impl, + m_impl->getNamespaceBeginOffset('A').v, + m_impl->getNamespaceEndOffset('A').v + ); + } else { + // We are a zim archive without namespace but without specific articles listing. + // We don't the choice here, iterate on all user entries. + return EntryRange( + m_impl, + m_impl->getStartUserEntry().v, + m_impl->getEndUserEntry().v + ); + } + } + + Archive::EntryRange Archive::iterEfficient() const + { + return EntryRange(m_impl, 0, getEntryCount()); + } + + Archive::EntryRange Archive::findByPath(std::string path) const + { + // "url order" means that the entries are stored by long url ("NS/url)". 
+ // + // If we really wanted to search by url whatever the namespace is, we would have to + // search in all "content" (A, I, J, -) namespaces and then merge the results. + // + // It would be pretty complex as we would need to iterate over several ranges + // at the same time. Let's enforce that path is the full path and search in whatever + // namespace is in it. + + // We have to return two iterators for the range of entries where `path` is a prefix. + // - The begin iterator is an iterator to the first entry with `path` as a prefix (or (range) end if none) + // - The end iterator is the iterator past the last entry with `path` as a prefix (or (global) end) + // + // findx returns the index of the exact match or of the entry just after. + // So, for the begin iterator, we can simply use the index returned by findx. + // For the end iterator we have to do the same but with a prefix "just after" the queried `path`. + // So the end index will always be just after the prefix range. If there is no prefix range, both + // begin and end will be just after where it would be. + // + // Supposing a list of titles: + // 0. aaaaaa + // 1. aaaaab + // 2. aabbaa + // 3. aabbbb + // 4. bbaaaa + // 5. bbbb + // 6. bbbbaa + // 7. bbbbbb + // 8. 
+ + // If we search for prefix aabb, we must return 2/4 + // A findx with aabb will return 2 + // A findx with aabc will return 4 + // + // If we search for prefix bbbb, we must return 5/8 + // A findx with bbbb will return 5 (with exact match) + // A findx with bbbc will return 8 + // + // If we search for prefix cccc, we must return 8/8 + // A findx with cccc will return 8 + // A findx with cccd will return 8 + // + // If we search for prefix a, we must return 0/4 + // A findx with a will return 0 + // A findx with b will return 4 + entry_index_t begin_idx, end_idx; + if (path.empty() || path == "/") { + begin_idx = m_impl->getStartUserEntry(); + end_idx = m_impl->getEndUserEntry(); + } else if (m_impl->hasNewNamespaceScheme()) { + begin_idx = m_impl->findx('C', path).second; + path.back()++; + end_idx = m_impl->findx('C', path).second; + } else { + char ns; + try { + std::tie(ns, path) = parseLongPath(path); + } catch (...) { + return Archive::EntryRange(m_impl, 0, 0); + } + begin_idx = m_impl->findx(ns, path).second; + if (path.empty()) { + ns++; + } else { + path.back()++; + } + end_idx = m_impl->findx(ns, path).second; + } + return Archive::EntryRange(m_impl, begin_idx.v, end_idx.v); + } + + Archive::EntryRange Archive::findByTitle(std::string title) const + { + // "title order" means that the entries are stored by "NS/title" part. + // It is nice when we want to search for title in a specific namespace, but + // now we want to hide the namespace. It would be better if the "title order" + // would be real title order, whatever is the namespace. + // + // If we really wanted to search by title whatever the namespace is, we would have to + // search in all "content" namespaces and then merge the results. + // + // The find by title is only used for the article (`A` namespace). So let's search + // only in it. + + // See `Archive::findByPath` for the rationale. + auto ns = m_impl->hasNewNamespaceScheme() ? 
'C' : 'A'; + auto begin_idx = m_impl->findxByTitle(ns, title).second; + title.back()++; + auto end_idx = m_impl->findxByTitle(ns, title).second; + return Archive::EntryRange(m_impl, begin_idx.v, end_idx.v); + } + + bool Archive::hasChecksum() const + { + return m_impl->getFileheader().hasChecksum(); + } + + std::string Archive::getChecksum() const + { + return m_impl->getChecksum(); + } + + bool Archive::check() const + { + return m_impl->verify(); + } + + bool Archive::isMultiPart() const + { + return m_impl->is_multiPart(); + } + + bool Archive::hasNewNamespaceScheme() const + { + return m_impl->hasNewNamespaceScheme(); + } + + cluster_index_type Archive::getClusterCount() const + { + return cluster_index_type(m_impl->getCountClusters()); + } + + offset_type Archive::getClusterOffset(cluster_index_type idx) const + { + return offset_type(m_impl->getClusterOffset(cluster_index_t(idx))); + } + + entry_index_type Archive::getMainEntryIndex() const + { + return m_impl->getFileheader().getMainPage(); + } + + template<> + entry_index_type + _toPathOrder(const FileImpl& impl, entry_index_type idx) + { + return idx; + } + + template<> + entry_index_type + _toPathOrder(const FileImpl& impl, entry_index_type idx) + { + return impl.getIndexByTitle(title_index_t(idx)).v; + } + + template<> + entry_index_type + _toPathOrder(const FileImpl& impl, entry_index_type idx) + { + return impl.getIndexByClusterOrder(entry_index_t(idx)).v; + } + + bool Archive::checkIntegrity(IntegrityCheck checkType) + { + return m_impl->checkIntegrity(checkType); + } + + bool validate(const std::string& zimPath, IntegrityCheckList checksToRun) + { + try + { + Archive a(zimPath); + for ( size_t i = 0; i < checksToRun.size(); ++i ) + { + if ( checksToRun.test(i) && !a.checkIntegrity(IntegrityCheck(i)) ) + return false; + } + } + catch(ZimFileFormatError &exception) + { + std::cerr << exception.what() << std::endl; + return false; + } + + return true; + } + +} // namespace zim diff --git a/src/blob.cpp 
b/src/blob.cpp new file mode 100644 index 0000000..39716c4 --- /dev/null +++ b/src/blob.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2017-2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + + +#include "zim/blob.h" +#include "debug.h" +#include "buffer.h" + +namespace zim { + +namespace +{ + +struct NoDelete +{ + template void operator()(T*) {} +}; + +// This shared_ptr is used as a source object for the std::shared_ptr +// aliasing constructor (with the purpose of avoiding the control block +// allocation) for the case when the referred data must not be deleted. 
+static Blob::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); + +} // unnamed namespace + + +Blob::Blob() + : _data(nonOwnedDataPtr), + _size(0) +{} + +Blob::Blob(const char* data, size_type size) + : _data(nonOwnedDataPtr, data), + _size(size) +{ + ASSERT(size, <, SIZE_MAX); + ASSERT(data, <, (void*)(SIZE_MAX-size)); +} + +Blob::Blob(const DataPtr& buffer, size_type size) + : _data(buffer), + _size(size) +{} + + + + +} //zim diff --git a/src/buffer.cpp b/src/buffer.cpp new file mode 100644 index 0000000..6cc7896 --- /dev/null +++ b/src/buffer.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2017-2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "buffer.h" + +#include +#include +#include +#include +#include +#include +#include + +#ifndef _WIN32 +# include +# include +#endif + +namespace zim { + +namespace +{ + +struct NoDelete +{ + template void operator()(T*) {} +}; + +// This shared_ptr is used as a source object for the std::shared_ptr +// aliasing constructor (with the purpose of avoiding the control block +// allocation) for the case when the referred data must not be deleted. 
+static Buffer::DataPtr nonOwnedDataPtr((char*)nullptr, NoDelete()); + +} // unnamed namespace + +const Buffer Buffer::sub_buffer(offset_t offset, zsize_t size) const +{ + ASSERT(offset.v, <=, m_size.v); + ASSERT(offset.v+size.v, <=, m_size.v); + auto sub_data = DataPtr(m_data, data(offset)); + return Buffer(sub_data, size); +} + +const Buffer Buffer::makeBuffer(const DataPtr& data, zsize_t size) +{ + return Buffer(data, size); +} + +const Buffer Buffer::makeBuffer(const char* data, zsize_t size) +{ + return Buffer(DataPtr(nonOwnedDataPtr, data), size); +} + +Buffer Buffer::makeBuffer(zsize_t size) +{ + if (0 == size.v) { + return Buffer(DataPtr(nonOwnedDataPtr, nullptr), size); + } + return Buffer(DataPtr(new char[size.v], std::default_delete()), size); +} + +Buffer::Buffer(const DataPtr& data, zsize_t size) + : m_size(size), + m_data(data) +{ + ASSERT(m_size.v, <, SIZE_MAX); +} + +const char* +Buffer::data(offset_t offset) const { + ASSERT(offset.v, <=, m_size.v); + return m_data.get() + offset.v; +} + +} //zim diff --git a/src/buffer.h b/src/buffer.h new file mode 100644 index 0000000..b14e609 --- /dev/null +++ b/src/buffer.h @@ -0,0 +1,65 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2017-2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BUFFER_H_ +#define ZIM_BUFFER_H_ + +#include +#include +#include +#include + +#include "config.h" +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" +#include + +namespace zim { + +class Buffer { + public: // types + typedef std::shared_ptr DataPtr; + + public: // functions + static const Buffer makeBuffer(const char* data, zsize_t size); + static const Buffer makeBuffer(const DataPtr& data, zsize_t size); + static Buffer makeBuffer(zsize_t size); + + const char* data(offset_t offset=offset_t(0)) const; + + char at(offset_t offset) const { + return *(data(offset)); + } + zsize_t size() const { return m_size; } + const Buffer sub_buffer(offset_t offset, zsize_t size) const; + operator Blob() const { return Blob(m_data, m_size.v); } + + private: // functions + Buffer(const DataPtr& data, zsize_t size); + + private: // data + zsize_t m_size; + DataPtr m_data; +}; + +} // zim namespace + +#endif //ZIM_BUFFER_H_ diff --git a/src/buffer_reader.cpp b/src/buffer_reader.cpp new file mode 100644 index 0000000..374d4da --- /dev/null +++ b/src/buffer_reader.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "buffer_reader.h" +#include "buffer.h" + +#include + +namespace zim { + +const Buffer BufferReader::get_buffer(offset_t offset, zsize_t size) const +{ + return source.sub_buffer(offset, size); +} + +std::unique_ptr BufferReader::sub_reader(offset_t offset, zsize_t size) const +{ + auto sub_buff = get_buffer(offset, size); + std::unique_ptr sub_read(new BufferReader(sub_buff)); + return sub_read; +} + +zsize_t BufferReader::size() const +{ + return source.size(); +} + +offset_t BufferReader::offset() const +{ + return offset_t((offset_type)(static_cast(source.data(offset_t(0))))); +} + + +void BufferReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <=, source.size().v); + ASSERT(offset+offset_t(size.v), <=, offset_t(source.size().v)); + if (! size ) { + return; + } + memcpy(dest, source.data(offset), size.v); +} + + +char BufferReader::read(offset_t offset) const { + ASSERT(offset.v, <, source.size().v); + char dest; + dest = *source.data(offset); + return dest; +} + + +} // zim diff --git a/src/buffer_reader.h b/src/buffer_reader.h new file mode 100644 index 0000000..938aecc --- /dev/null +++ b/src/buffer_reader.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BUFFER_READER_H_ +#define ZIM_BUFFER_READER_H_ + +#include "reader.h" + +namespace zim { + +class BufferReader : public Reader { + public: + BufferReader(const Buffer& source) + : source(source) {} + virtual ~BufferReader() {}; + + zsize_t size() const; + offset_t offset() const; + + void read(char* dest, offset_t offset, zsize_t size) const; + char read(offset_t offset) const; + const Buffer get_buffer(offset_t offset, zsize_t size) const; + std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; + + private: + const Buffer source; +}; + +}; + +#endif // ZIM_BUFFER_READER_H_ diff --git a/src/bufferstreamer.h b/src/bufferstreamer.h new file mode 100644 index 0000000..ff447d9 --- /dev/null +++ b/src/bufferstreamer.h @@ -0,0 +1,78 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_BUFFERSTREAMER_H +#define ZIM_BUFFERSTREAMER_H + +#include "debug.h" + +#include + +namespace zim +{ + +class BufferStreamer +{ +public: // functions + BufferStreamer(const Buffer& buffer, zsize_t size) + : m_buffer(buffer), + m_current(buffer.data()), + m_size(size) + {} + + explicit BufferStreamer(const Buffer& buffer) + : BufferStreamer(buffer, buffer.size()) + {} + + // Reads a value of the said type from the stream + // + // For best portability this function should be used with types of known + // bit-width (int32_t, uint16_t, etc) rather than builtin types with + // unknown bit-width (int, unsigned, etc). + template T read() + { + const size_t N(sizeof(T)); + char buf[N]; + memcpy(buf, m_current, N); + skip(zsize_t(N)); + return fromLittleEndian(buf); // XXX: This handles only integral types + } + + const char* current() const { + return m_current; + } + + zsize_t left() const { + return m_size; + } + + void skip(zsize_t nbBytes) { + m_current += nbBytes.v; + m_size -= nbBytes; + } + +private: // data + const Buffer m_buffer; + const char* m_current; + zsize_t m_size; +}; + +} // namespace zim + +#endif // ZIM_BUFDATASTREAM_H diff --git a/src/cluster.cpp b/src/cluster.cpp new file mode 100644 index 0000000..32afd8d --- /dev/null +++ b/src/cluster.cpp @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2016-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include +#include +#include "buffer_reader.h" +#include "endian_tools.h" +#include "bufferstreamer.h" +#include "decoderstreamreader.h" +#include "rawstreamreader.h" +#include +#include +#include + +#include "compression.h" +#include "log.h" + +#include "config.h" + +log_define("zim.cluster") + +#define log_debug1(e) + +namespace zim +{ + +namespace +{ + +std::unique_ptr +getClusterReader(const Reader& zimReader, offset_t offset, Cluster::Compression* comp, bool* extended) +{ + uint8_t clusterInfo = zimReader.read(offset); + // Very old zim files used 0 as a "default" compression, which means no compression. 
+ uint8_t compInfo = clusterInfo & 0x0F; + if(compInfo == 0) { + *comp = Cluster::Compression::None; + } else if (compInfo == int(Cluster::Compression::Zip)) { + throw std::runtime_error("zlib not enabled in this library"); + } else if (compInfo == int(Cluster::Compression::Bzip2)) { + throw std::runtime_error("bzip2 not enabled in this library"); + } else { + *comp = static_cast(compInfo); + } + *extended = clusterInfo & 0x10; + auto subReader = std::shared_ptr(zimReader.sub_reader(offset+offset_t(1))); + + switch ( *comp ) { + case Cluster::Compression::None: + return std::unique_ptr(new RawStreamReader(subReader)); + case Cluster::Compression::Lzma: + return std::unique_ptr(new DecoderStreamReader(subReader)); + case Cluster::Compression::Zstd: + return std::unique_ptr(new DecoderStreamReader(subReader)); + default: + throw ZimFileFormatError("Invalid compression flag"); + } +} + +} // unnamed namespace + + std::shared_ptr Cluster::read(const Reader& zimReader, offset_t clusterOffset) + { + Compression comp; + bool extended; + auto reader = getClusterReader(zimReader, clusterOffset, &comp, &extended); + return std::make_shared(std::move(reader), comp, extended); + } + + Cluster::Cluster(std::unique_ptr reader_, Compression comp, bool isExtended) + : compression(comp), + isExtended(isExtended), + m_reader(std::move(reader_)) + { + if (isExtended) { + read_header(); + } else { + read_header(); + } + } + + /* This return the number of char read */ + template + void Cluster::read_header() + { + // read first offset, which specifies, how many offsets we need to read + OFFSET_TYPE offset = m_reader->read(); + + size_t n_offset = offset / sizeof(OFFSET_TYPE); + const offset_t data_address(offset); + + // read offsets + m_blobOffsets.clear(); + m_blobOffsets.reserve(n_offset); + m_blobOffsets.push_back(offset_t(offset)); + + // Get the whole offsets data to avoid to many (system) call. 
+ auto bufferSize = zsize_t(offset-sizeof(OFFSET_TYPE)); + auto buffer = m_reader->sub_reader(bufferSize)->get_buffer(offset_t(0), bufferSize); + auto seqReader = BufferStreamer(buffer, bufferSize); + while (--n_offset) + { + OFFSET_TYPE new_offset = seqReader.read(); + ASSERT(new_offset, >=, offset); + + m_blobOffsets.push_back(offset_t(new_offset)); + offset = new_offset; + } + } + + zsize_t Cluster::getBlobSize(blob_index_t n) const + { + if (blob_index_type(n)+1 >= m_blobOffsets.size()) { + throw ZimFileFormatError("blob index out of range"); + } + return zsize_t(m_blobOffsets[blob_index_type(n)+1].v - m_blobOffsets[blob_index_type(n)].v); + } + + const Reader& Cluster::getReader(blob_index_t n) const + { + std::lock_guard lock(m_readerAccessMutex); + for(blob_index_type current(m_blobReaders.size()); current<=n.v; ++current) { + auto blobSize = getBlobSize(blob_index_t(current)); + if (blobSize.v > SIZE_MAX) { + m_blobReaders.push_back(std::unique_ptr(new BufferReader(Buffer::makeBuffer(zsize_t(0))))); + } else { + m_blobReaders.push_back(m_reader->sub_reader(blobSize)); + } + } + return *m_blobReaders[blob_index_type(n)]; + } + + Blob Cluster::getBlob(blob_index_t n) const + { + if (n < count()) { + const auto blobSize = getBlobSize(n); + if (blobSize.v > SIZE_MAX) { + return Blob(); + } + return getReader(n).get_buffer(offset_t(0), blobSize); + } else { + return Blob(); + } + } + + Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const + { + if (n < count()) { + const auto blobSize = getBlobSize(n); + if ( offset.v > blobSize.v ) { + return Blob(); + } + size = std::min(size, zsize_t(blobSize.v-offset.v)); + if (size.v > SIZE_MAX) { + return Blob(); + } + return getReader(n).get_buffer(offset, size); + } else { + return Blob(); + } + } + +} diff --git a/src/cluster.h b/src/cluster.h new file mode 100644 index 0000000..e8c9662 --- /dev/null +++ b/src/cluster.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2016-2021 Matthieu Gautier + * Copyright 
(C) 2020 Veloman Yunkan + * Copyright (C) 2020 Miguel Rocha + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CLUSTER_H +#define ZIM_CLUSTER_H + +#include +#include "buffer.h" +#include "zim_types.h" +#include "file_reader.h" +#include +#include +#include +#include + +#include "zim_types.h" +#include "zim/error.h" + +namespace zim +{ + class Blob; + class Reader; + class IStreamReader; + + class Cluster : public std::enable_shared_from_this { + typedef std::vector BlobOffsets; + typedef std::vector> BlobReaders; + + public: + // zim::Compression lists only compression methods supported by the + // writer. But on the reader side we need to deal with some historical + // compression types. Here we maintain the full list of compression + // types. 
+ enum class Compression + { + None = 1, + Zip, // Support is discontinued + Bzip2, // Support is discontinued + Lzma, // Supported only by the reader + Zstd + }; + + public: + const Compression compression; + const bool isExtended; + + private: + std::unique_ptr m_reader; + + // offsets of the blob boundaries relative to the start of the cluster data + // (*after* the first byte (clusterInfo)) + // For a cluster with N blobs, this collection contains N+1 entries. + // The start of the first blob and the end of the last blob are included. + BlobOffsets m_blobOffsets; + + mutable std::mutex m_readerAccessMutex; + mutable BlobReaders m_blobReaders; + + + template + void read_header(); + const Reader& getReader(blob_index_t n) const; + + public: + Cluster(std::unique_ptr reader, Compression comp, bool isExtended); + Compression getCompression() const { return compression; } + bool isCompressed() const { return compression != Compression::None; } + + blob_index_t count() const { return blob_index_t(m_blobOffsets.size() - 1); } + + zsize_t getBlobSize(blob_index_t n) const; + + offset_t getBlobOffset(blob_index_t n) const { return offset_t(1) + m_blobOffsets[blob_index_type(n)]; } + Blob getBlob(blob_index_t n) const; + Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const; + + static std::shared_ptr read(const Reader& zimReader, offset_t clusterOffset); + }; + +} + +#endif // ZIM_CLUSTER_H diff --git a/src/compression.cpp b/src/compression.cpp new file mode 100644 index 0000000..f145040 --- /dev/null +++ b/src/compression.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * Copyright (C) 2020 Emmanuel Engelhart + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the impliedD + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "compression.h" + +#include "envvalue.h" + +#include + +const std::string LZMA_INFO::name = "lzma"; +void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data) +{ + *stream = LZMA_STREAM_INIT; + unsigned memsize = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024); + auto errcode = lzma_stream_decoder(stream, memsize, 0); + if (errcode != LZMA_OK) { + throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream"); + } +} + +CompStatus LZMA_INFO::stream_run_decode(stream_t* stream, CompStep step) { + return stream_run(stream, step); +} + +CompStatus LZMA_INFO::stream_run(stream_t* stream, CompStep step) +{ + auto errcode = lzma_code(stream, step==CompStep::STEP?LZMA_RUN:LZMA_FINISH); + switch(errcode) { + case LZMA_BUF_ERROR: + return CompStatus::BUF_ERROR; + case LZMA_STREAM_END: + return CompStatus::STREAM_END; + case LZMA_OK: + return CompStatus::OK; + default: { + std::ostringstream ss; + ss << "Unexpected lzma status : " << errcode; + throw std::runtime_error(ss.str()); + } + } +} + +void LZMA_INFO::stream_end_decode(stream_t* stream) +{ + lzma_end(stream); +} + + +const std::string ZSTD_INFO::name = "zstd"; + +ZSTD_INFO::stream_t::stream_t() +: next_in(nullptr), + avail_in(0), + next_out(nullptr), + avail_out(0), + total_out(0), + encoder_stream(nullptr), + decoder_stream(nullptr) +{} + +ZSTD_INFO::stream_t::~stream_t() +{ + if ( encoder_stream ) + 
::ZSTD_freeCStream(encoder_stream); + + if ( decoder_stream ) + ::ZSTD_freeDStream(decoder_stream); +} + +void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data) +{ + stream->decoder_stream = ::ZSTD_createDStream(); + auto ret = ::ZSTD_initDStream(stream->decoder_stream); + if (::ZSTD_isError(ret)) { + throw std::runtime_error("Failed to initialize Zstd decompression"); + } +} + +void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data) +{ + stream->encoder_stream = ::ZSTD_createCStream(); + auto ret = ::ZSTD_initCStream(stream->encoder_stream, 19); + if (::ZSTD_isError(ret)) { + throw std::runtime_error("Failed to initialize Zstd compression"); + } +} + +CompStatus ZSTD_INFO::stream_run_encode(stream_t* stream, CompStep step) { + ::ZSTD_inBuffer inBuf; + inBuf.src = stream->next_in; + inBuf.size = stream->avail_in; + inBuf.pos = 0; + + ::ZSTD_outBuffer outBuf; + outBuf.dst = stream->next_out; + outBuf.size = stream->avail_out; + outBuf.pos = 0; + + auto ret = step == CompStep::STEP + ? 
::ZSTD_compressStream(stream->encoder_stream, &outBuf, &inBuf) + : ::ZSTD_endStream(stream->encoder_stream, &outBuf); + stream->next_in += inBuf.pos; + stream->avail_in -= inBuf.pos; + stream->next_out += outBuf.pos; + stream->avail_out -= outBuf.pos; + stream->total_out += outBuf.pos; + + if (::ZSTD_isError(ret)) { + throw std::runtime_error(::ZSTD_getErrorName(ret)); + } + + if ( step == CompStep::STEP ) { + if ( stream->avail_in != 0) { + ASSERT(stream->avail_out, ==, 0u); + return CompStatus::BUF_ERROR; + } + } else if ( ret > 0 ) { + return CompStatus::BUF_ERROR; + } + + return CompStatus::OK; +} + +CompStatus ZSTD_INFO::stream_run_decode(stream_t* stream, CompStep /*step*/) { + ::ZSTD_inBuffer inBuf; + inBuf.src = stream->next_in; + inBuf.size = stream->avail_in; + inBuf.pos = 0; + + ::ZSTD_outBuffer outBuf; + outBuf.dst = stream->next_out; + outBuf.size = stream->avail_out; + outBuf.pos = 0; + + auto ret = ::ZSTD_decompressStream(stream->decoder_stream, &outBuf, &inBuf); + stream->next_in += inBuf.pos; + stream->avail_in -= inBuf.pos; + stream->next_out += outBuf.pos; + stream->avail_out -= outBuf.pos; + stream->total_out += outBuf.pos; + + if (::ZSTD_isError(ret)) + throw std::runtime_error(::ZSTD_getErrorName(ret)); + + if (ret == 0) + return CompStatus::STREAM_END; + + return CompStatus::BUF_ERROR; +} + +void ZSTD_INFO::stream_end_decode(stream_t* stream) +{ +} + +void ZSTD_INFO::stream_end_encode(stream_t* stream) +{ +} diff --git a/src/compression.h b/src/compression.h new file mode 100644 index 0000000..c6e03e0 --- /dev/null +++ b/src/compression.h @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * Copyright (C) 2020 Emmanuel Engelhart + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef _LIBZIM_COMPRESSION_ +#define _LIBZIM_COMPRESSION_ + +#include +#include "string.h" + +#include "file_reader.h" +#include + +#include "config.h" + +#include +#include + +#include "zim_types.h" +#include "constants.h" + +//#define DEB(X) std::cerr << __func__ << " " << X << std::endl ; +#define DEB(X) + +enum class CompStep { + STEP, + FINISH +}; + +enum class CompStatus { + OK, + STREAM_END, + BUF_ERROR, +}; + +enum class RunnerStatus { + OK, + NEED_MORE, + ERROR +}; + +struct LZMA_INFO { + typedef lzma_stream stream_t; + static const std::string name; + static void init_stream_decoder(stream_t* stream, char* raw_data); + static CompStatus stream_run_decode(stream_t* stream, CompStep step); + static CompStatus stream_run(stream_t* stream, CompStep step); + static void stream_end_decode(stream_t* stream); +}; + + +struct ZSTD_INFO { + struct stream_t + { + const unsigned char* next_in; + size_t avail_in; + unsigned char* next_out; + size_t avail_out; + size_t total_out; + + ::ZSTD_CStream* encoder_stream; + ::ZSTD_DStream* decoder_stream; + + stream_t(); + ~stream_t(); + private: + stream_t(const stream_t& t) = delete; + void operator=(const stream_t& t) = delete; + }; + + static const std::string name; + static void init_stream_decoder(stream_t* stream, char* raw_data); + static void init_stream_encoder(stream_t* stream, char* raw_data); + static CompStatus stream_run_encode(stream_t* stream, CompStep step); + static CompStatus 
stream_run_decode(stream_t* stream, CompStep step); + static void stream_end_encode(stream_t* stream); + static void stream_end_decode(stream_t* stream); +}; + + +namespace zim { + +template +class Uncompressor +{ + public: + Uncompressor(size_t initial_size) : + ret_data(new char[initial_size]), + data_size(initial_size) + {} + ~Uncompressor() = default; + + void init(char* data) { + INFO::init_stream_decoder(&stream, data); + stream.next_out = (uint8_t*)ret_data.get(); + stream.avail_out = data_size; + } + + RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) { + stream.next_in = (unsigned char*)data; + stream.avail_in = size; + while (true) { + auto errcode = INFO::stream_run_decode(&stream, step); + DEB((int)errcode) + switch (errcode) { + case CompStatus::BUF_ERROR: + if (stream.avail_in == 0 && stream.avail_out != 0) { + // End of input stream. + // compressor hasn't recognize the end of the input stream but there is + // no more input. + return RunnerStatus::NEED_MORE; + } else { + // Not enought output size. + // Allocate more memory and continue the loop. + DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out) + data_size *= 2; + std::unique_ptr new_ret_data(new char[data_size]); + memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); + stream.avail_out = data_size - stream.total_out; + DEB(data_size << " " << stream.avail_out << " " << stream.avail_in) + ret_data = std::move(new_ret_data); + } + break; + case CompStatus::OK: + // On first call where lzma cannot progress (no output size). + // Lzma return OK. If we return NEED_MORE, then we will try to compress + // with new input data, but we should not as current one is not processed. + // We must do a second step to have te BUF_ERROR and handle thing correctly. + // If we have no more input, then we must ask for more. 
+ if (stream.avail_in == 0) { + return RunnerStatus::NEED_MORE; + } + break; + case CompStatus::STREAM_END: + // End of compressed stream. Everything is ok. + return RunnerStatus::OK; + default: + // unreachable + return RunnerStatus::ERROR; + } + }; + // unreachable + return RunnerStatus::NEED_MORE; + } + + std::unique_ptr get_data(zim::zsize_t* size) { + feed(nullptr, 0, CompStep::FINISH); + size->v = stream.total_out; + INFO::stream_end_decode(&stream); + return std::move(ret_data); + } + + private: + std::unique_ptr ret_data; + size_type data_size; + typename INFO::stream_t stream; +}; + +#define CHUNCK_SIZE ((zim::size_type)(1024)) +/** + * Uncompress data of the reader at startOffset. + * + * @param reader The reader where the data is. + * @param startOffset The offset where the data is in the reader. + * @param[out] dest_size The size of the uncompressed data. + * @return A pointer to the uncompressed data. This must be deleted (delete[]) +*/ +template +std::unique_ptr uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) { + // Use a compressor to compress the data. + // As we don't know the result size, neither the compressed size, + // we have to do chunk by chunk until decompressor is happy. + // Let's assume it will be something like the default clusterSize used at creation + Uncompressor runner(DEFAULT_CLUSTER_SIZE); + // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk + // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE. 
+ std::vector raw_data(CHUNCK_SIZE); + + DEB("Init") + runner.init(raw_data.data()); + + zim::size_type availableSize = reader->size().v - startOffset.v; + auto ret = RunnerStatus::NEED_MORE; + while(ret != RunnerStatus::OK) { + if (ret == RunnerStatus::NEED_MORE and availableSize) { + zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE); + reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize)); + startOffset.v += inputSize; + availableSize -= inputSize; + DEB("Step " << startOffset.v) + ret = runner.feed(raw_data.data(), inputSize); + DEB("Ret " << (int)ret) + } + if (ret == RunnerStatus::ERROR) { + throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name + + std::string(" stream for cluster.")); + } + } + + DEB("Finish") + return runner.get_data(dest_size); +} + +template +class Compressor +{ + public: + Compressor(size_t initial_size=1024*1024) : + ret_data(new char[initial_size]), + ret_size(initial_size) + {} + + ~Compressor() = default; + + void init(char* data) { + INFO::init_stream_encoder(&stream, data); + stream.next_out = (uint8_t*)ret_data.get(); + stream.avail_out = ret_size; + } + + RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) { + stream.next_in = (unsigned char*)data; + stream.avail_in = size; + while (true) { + auto errcode = INFO::stream_run_encode(&stream, step); + switch (errcode) { + case CompStatus::OK: + if (stream.avail_out == 0) { + // lzma return a OK return status the first time it runs out of output memory. + // The BUF_ERROR is returned only the second time we call a lzma_code. 
+ continue; + } else { + return RunnerStatus::NEED_MORE; + } + case CompStatus::STREAM_END: + return RunnerStatus::NEED_MORE; + case CompStatus::BUF_ERROR: + if (stream.avail_out == 0) { + //Not enought output size + ret_size *= 2; + std::unique_ptr new_ret_data(new char[ret_size]); + memcpy(new_ret_data.get(), ret_data.get(), stream.total_out); + stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out); + stream.avail_out = ret_size - stream.total_out; + ret_data = std::move(new_ret_data); + continue; + } else { + return RunnerStatus::ERROR; + } + break; + default: + // unreachable + return RunnerStatus::ERROR; + }; + }; + // urreachable + return RunnerStatus::NEED_MORE; + } + + std::unique_ptr get_data(zim::zsize_t* size) { + feed(nullptr, 0, CompStep::FINISH); + INFO::stream_end_encode(&stream); + size->v = stream.total_out; + return std::move(ret_data); + } + + private: + std::unique_ptr ret_data; + size_t ret_size; + typename INFO::stream_t stream; +}; + +} // namespace zim + +#endif // _LIBZIM_COMPRESSION_ diff --git a/src/concurrent_cache.h b/src/concurrent_cache.h new file mode 100644 index 0000000..0533d65 --- /dev/null +++ b/src/concurrent_cache.h @@ -0,0 +1,95 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_CONCURRENT_CACHE_H +#define ZIM_CONCURRENT_CACHE_H + +#include "lrucache.h" + +#include +#include + +namespace zim +{ + +/** + ConcurrentCache implements a concurrent thread-safe cache + + Compared to zim::lru_cache, each access operation is slightly more expensive. + However, different slots of the cache can be safely accessed concurrently + with minimal blocking. Concurrent access to the same element is also + safe, and, in case of a cache miss, will block until that element becomes + available. + */ +template +class ConcurrentCache +{ +private: // types + typedef std::shared_future ValuePlaceholder; + typedef lru_cache Impl; + +public: // types + explicit ConcurrentCache(size_t maxEntries) + : impl_(maxEntries) + {} + + // Gets the entry corresponding to the given key. If the entry is not in the + // cache, it is obtained by calling f() (without any arguments) and the + // result is put into the cache. + // + // The cache as a whole is locked only for the duration of accessing + // the respective slot. If, in the case of the a cache miss, the generation + // of the missing element takes a long time, only attempts to access that + // element will block - the rest of the cache remains open to concurrent + // access. 
+ template + Value getOrPut(const Key& key, F f) + { + std::promise valuePromise; + std::unique_lock l(lock_); + const auto x = impl_.getOrPut(key, valuePromise.get_future().share()); + l.unlock(); + if ( x.miss() ) { + try { + valuePromise.set_value(f()); + } catch (std::exception& e) { + drop(key); + throw; + } + } + + return x.value().get(); + } + + bool drop(const Key& key) + { + std::unique_lock l(lock_); + return impl_.drop(key); + } + +private: // data + Impl impl_; + std::mutex lock_; +}; + +} // namespace zim + +#endif // ZIM_CONCURRENT_CACHE_H + diff --git a/src/config.h.in b/src/config.h.in new file mode 100644 index 0000000..77991c3 --- /dev/null +++ b/src/config.h.in @@ -0,0 +1,22 @@ + +#mesondefine VERSION + +#mesondefine DIRENT_CACHE_SIZE + +#mesondefine DIRENT_LOOKUP_CACHE_SIZE + +#mesondefine CLUSTER_CACHE_SIZE + +#mesondefine LZMA_MEMORY_SIZE + +#mesondefine ENABLE_XAPIAN + +#mesondefine ENABLE_USE_MMAP + +#mesondefine ENABLE_USE_BUFFER_HEADER + +#mesondefine MMAP_SUPPORT_64 + +#mesondefine ENV64BIT + +#mesondefine ENV32BIT diff --git a/src/constants.h b/src/constants.h new file mode 100644 index 0000000..2ed146c --- /dev/null +++ b/src/constants.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2021 Maneesh P M + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
// String constant. The trailing space is part of the value.
#define ANCHOR_TERM "0posanchor "

// 2 MiB. The expression is parenthesized so that the macro behaves as a single
// value whatever the expression it is used in (e.g. `x / DEFAULT_CLUSTER_SIZE`).
#define DEFAULT_CLUSTER_SIZE (2*1024*1024)
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef DEBUG_H_ +#define DEBUG_H_ + +#include +#include +#include +#include + +#if defined (NDEBUG) +# define ASSERT(left, operator, right) (void(0)) +#else + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) +#include +#endif + +template +void _on_assert_fail(const char* vara, const char* op, const char* varb, + T a, U b, const char* file, int line) { + std::ostringstream ss; + ss << "\nAssertion failed at "<< file << ":" << line << "\n " << + vara << "[" << a << "] " << op << " " << varb << "[" << b << "]"; + std::cerr << ss.str() << std::endl; + +#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__) + void *callstack[64]; + size_t size; + size = backtrace(callstack, 64); + char** strings = backtrace_symbols(callstack, size); + for (size_t i=0; i + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DECODERSTREAMREADER_H +#define ZIM_DECODERSTREAMREADER_H + +#include "compression.h" +#include "istreamreader.h" + +namespace zim +{ + +template +class DecoderStreamReader : public IStreamReader +{ +private: // constants + enum { CHUNK_SIZE = 1024 }; + +public: // functions + DecoderStreamReader(std::shared_ptr inputReader) + : m_encodedDataReader(inputReader), + m_currentInputOffset(0), + m_inputBytesLeft(inputReader->size()), + m_encodedDataChunk(Buffer::makeBuffer(zsize_t(CHUNK_SIZE))) + { + Decoder::init_stream_decoder(&m_decoderState, nullptr); + readNextChunk(); + } + + ~DecoderStreamReader() + { + Decoder::stream_end_decode(&m_decoderState); + } + +private: // functions + void readNextChunk() + { + const auto n = std::min(zsize_t(CHUNK_SIZE), m_inputBytesLeft); + m_encodedDataChunk = m_encodedDataReader->get_buffer(m_currentInputOffset, n); + m_currentInputOffset += n; + m_inputBytesLeft -= n; + // XXX: ugly C-style cast (casting away constness) on the next line + m_decoderState.next_in = (unsigned char*)m_encodedDataChunk.data(); + m_decoderState.avail_in = m_encodedDataChunk.size().v; + } + + CompStatus decodeMoreBytes() + { + CompStep step = CompStep::STEP; + if ( m_decoderState.avail_in == 0 ) + { + if ( m_inputBytesLeft.v == 0 ) + step = CompStep::FINISH; + else + readNextChunk(); + } + + return Decoder::stream_run_decode(&m_decoderState, step); + } + + void readImpl(char* buf, zsize_t nbytes) override + { + m_decoderState.next_out = (unsigned char*)buf; + m_decoderState.avail_out = nbytes.v; + while ( m_decoderState.avail_out != 0 ) + { + // We don't car of the return code of decodeMoreBytes. + // We feed (or stop feeding) the decoder based on what + // we need to decode and the `avail_in`. 
+ // If there is a error somehow, a exception will be thrown. + decodeMoreBytes(); + } + } + +private: // types + typedef typename Decoder::stream_t DecoderState; + +private: // data + std::shared_ptr m_encodedDataReader; + offset_t m_currentInputOffset; + zsize_t m_inputBytesLeft; // count of bytes left in the input stream + DecoderState m_decoderState; + Buffer m_encodedDataChunk; +}; + +} // namespace zim + +#endif // ZIM_DECODERSTREAMREADER_H diff --git a/src/dirent.cpp b/src/dirent.cpp new file mode 100644 index 0000000..9c26212 --- /dev/null +++ b/src/dirent.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (C) 2017-2020 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "_dirent.h" +#include "direntreader.h" +#include +#include +#include "buffer.h" +#include "bufferstreamer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include + +log_define("zim.dirent") + +namespace zim +{ + ////////////////////////////////////////////////////////////////////// + // Dirent + // + + const uint16_t Dirent::redirectMimeType; + const uint16_t Dirent::linktargetMimeType; + const uint16_t Dirent::deletedMimeType; + + bool DirentReader::initDirent(Dirent& dirent, const Buffer& direntData) const + { + BufferStreamer reader(direntData); + uint16_t mimeType = reader.read(); + bool redirect = (mimeType == Dirent::redirectMimeType); + bool linktarget = (mimeType == Dirent::linktargetMimeType); + bool deleted = (mimeType == Dirent::deletedMimeType); + uint8_t extraLen = reader.read(); + char ns = reader.read(); + uint32_t version = reader.read(); + dirent.setVersion(version); + + if (redirect) + { + entry_index_type redirectIndex(reader.read()); + + log_debug("redirectIndex=" << redirectIndex); + + dirent.setRedirect(entry_index_t(redirectIndex)); + } + else if (linktarget || deleted) + { + log_debug("linktarget or deleted entry"); + dirent.setItem(mimeType, cluster_index_t(0), blob_index_t(0)); + } + else + { + log_debug("read article entry"); + + uint32_t clusterNumber = reader.read(); + uint32_t blobNumber = reader.read(); + + log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber); + + dirent.setItem(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber)); + } + + std::string url; + std::string title; + std::string parameter; + + log_debug("read url, title and parameters"); + + size_type url_size = strnlen( + reader.current(), + reader.left().v - 
extraLen + ); + if (url_size >= reader.left().v) { + return false; + } + url = std::string(reader.current(), url_size); + reader.skip(zsize_t(url_size+1)); + + size_type title_size = strnlen( + reader.current(), + reader.left().v - extraLen + ); + if (title_size >= reader.left().v) { + return false; + } + title = std::string(reader.current(), title_size); + reader.skip(zsize_t(title_size+1)); + + if (extraLen > reader.left().v) { + return false; + } + parameter = std::string(reader.current(), extraLen); + dirent.setUrl(ns, url); + dirent.setTitle(title); + dirent.setParameter(parameter); + return true; + } + + std::shared_ptr DirentReader::readDirent(offset_t offset) + { + const auto totalSize = mp_zimReader->size(); + if (offset.v >= totalSize.v) { + throw ZimFileFormatError("Invalid dirent pointer"); + } + + // We don't know the size of the dirent because it depends of the size of + // the title, url and extra parameters. + // This is a pity but we have no choice. + // We cannot take a buffer of the size of the file, it would be really + // inefficient. Let's do try, catch and retry while chosing a smart value + // for the buffer size. Most dirent will be "Article" entry (header's size + // == 16) without extra parameters. Let's hope that url + title size will + // be < 256 and if not try again with a bigger size. 
+ + size_t bufferSize(std::min(size_type(256), mp_zimReader->size().v-offset.v)); + auto dirent = std::make_shared(); + std::lock_guard lock(m_bufferMutex); + for ( ; ; bufferSize += 256 ) { + m_buffer.reserve(bufferSize); + mp_zimReader->read(m_buffer.data(), offset, zsize_t(bufferSize)); + if ( initDirent(*dirent, Buffer::makeBuffer(m_buffer.data(), zsize_t(bufferSize))) ) + return dirent; + } + } + + std::string Dirent::getLongUrl() const + { + log_trace("Dirent::getLongUrl()"); + log_debug("namespace=" << getNamespace() << " title=" << getTitle()); + + return std::string(1, getNamespace()) + '/' + getUrl(); + } + +} diff --git a/src/dirent_accessor.cpp b/src/dirent_accessor.cpp new file mode 100644 index 0000000..73a8f42 --- /dev/null +++ b/src/dirent_accessor.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "dirent_accessor.h" + +#include "direntreader.h" +#include "_dirent.h" +#include "envvalue.h" + +#include + +#include + +using namespace zim; + +DirectDirentAccessor::DirectDirentAccessor(std::shared_ptr direntReader, std::unique_ptr urlPtrReader, entry_index_t direntCount) + : mp_direntReader(direntReader), + mp_urlPtrReader(std::move(urlPtrReader)), + m_direntCount(direntCount), + m_direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)), + m_bufferDirentZone(256) +{} + +std::shared_ptr DirectDirentAccessor::getDirent(entry_index_t idx) const +{ + { + std::lock_guard l(m_direntCacheLock); + auto v = m_direntCache.get(idx.v); + if (v.hit()) { + return v.value(); + } + } + + auto direntOffset = getOffset(idx); + auto dirent = readDirent(direntOffset); + std::lock_guard l(m_direntCacheLock); + m_direntCache.put(idx.v, dirent); + + return dirent; +} + +offset_t DirectDirentAccessor::getOffset(entry_index_t idx) const +{ + if (idx >= m_direntCount) { + throw std::out_of_range("entry index out of range"); + } + offset_t offset(mp_urlPtrReader->read_uint(offset_t(sizeof(offset_type)*idx.v))); + return offset; +} + +std::shared_ptr DirectDirentAccessor::readDirent(offset_t offset) const +{ + return mp_direntReader->readDirent(offset); +} + + +IndirectDirentAccessor::IndirectDirentAccessor(std::shared_ptr direntAccessor, std::unique_ptr indexReader, title_index_t direntCount) + : mp_direntAccessor(direntAccessor), + mp_indexReader(std::move(indexReader)), + m_direntCount(direntCount) +{} + +entry_index_t IndirectDirentAccessor::getDirectIndex(title_index_t idx) const +{ + if (idx >= m_direntCount) { + throw std::out_of_range("entry index out of range"); + } + entry_index_t 
index(mp_indexReader->read_uint(offset_t(sizeof(entry_index_t)*idx.v))); + return index; +} + +std::shared_ptr IndirectDirentAccessor::getDirent(title_index_t idx) const +{ + auto directIndex = getDirectIndex(idx); + return mp_direntAccessor->getDirent(directIndex); +} diff --git a/src/dirent_accessor.h b/src/dirent_accessor.h new file mode 100644 index 0000000..501e9b6 --- /dev/null +++ b/src/dirent_accessor.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENT_ACCESSOR_H +#define ZIM_DIRENT_ACCESSOR_H + +#include "zim_types.h" +#include "debug.h" +#include "lrucache.h" + +#include +#include +#include + +namespace zim +{ + +class Dirent; +class Reader; +class DirentReader; + +/** + * DirectDirentAccessor is used to access a dirent from its index. + * It doesn't provide any "advanced" features like lookup or find. + * + * This is the base class to locate a dirent (offset) and read it. 
+ * + */ + +class DirectDirentAccessor +{ +public: // functions + DirectDirentAccessor(std::shared_ptr direntReader, std::unique_ptr urlPtrReader, entry_index_t direntCount); + + offset_t getOffset(entry_index_t idx) const; + std::shared_ptr getDirent(entry_index_t idx) const; + entry_index_t getDirentCount() const { return m_direntCount; } + +private: // functions + std::shared_ptr readDirent(offset_t) const; + +private: // data + std::shared_ptr mp_direntReader; + std::unique_ptr mp_urlPtrReader; + entry_index_t m_direntCount; + + mutable lru_cache> m_direntCache; + mutable std::mutex m_direntCacheLock; + + mutable std::vector m_bufferDirentZone; + mutable std::mutex m_bufferDirentLock; +}; + +class IndirectDirentAccessor +{ + public: + IndirectDirentAccessor(std::shared_ptr, std::unique_ptr indexReader, title_index_t direntCount); + + entry_index_t getDirectIndex(title_index_t idx) const; + std::shared_ptr getDirent(title_index_t idx) const; + title_index_t getDirentCount() const { return m_direntCount; } + + private: // data + std::shared_ptr mp_direntAccessor; + std::unique_ptr mp_indexReader; + title_index_t m_direntCount; +}; + +} // namespace zim + +#endif // ZIM_DIRENT_ACCESSOR_H diff --git a/src/dirent_lookup.h b/src/dirent_lookup.h new file mode 100644 index 0000000..5daefa4 --- /dev/null +++ b/src/dirent_lookup.h @@ -0,0 +1,250 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENT_LOOKUP_H +#define ZIM_DIRENT_LOOKUP_H + +#include "zim_types.h" +#include "debug.h" +#include "narrowdown.h" + +#include +#include +#include +#include +#include + +namespace zim +{ + +template +class DirentLookup +{ +public: // types + typedef typename TConfig::DirentAccessorType DirentAccessor; + typedef typename TConfig::index_t index_t; + typedef std::pair Result; + +public: // functions + explicit DirentLookup(const DirentAccessor* _direntAccessor); + + index_t getNamespaceRangeBegin(char ns) const; + index_t getNamespaceRangeEnd(char ns) const; + + Result find(char ns, const std::string& key) const; + +protected: // functions + int compareWithDirentAt(char ns, const std::string& key, entry_index_type i) const; + Result findInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const; + Result binarySearchInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const; + +protected: // types + typedef std::map NamespaceBoundaryCache; + +protected: // data + const DirentAccessor& direntAccessor; + const entry_index_type direntCount; + + mutable NamespaceBoundaryCache namespaceBoundaryCache; + mutable std::mutex cacheAccessMutex; +}; + +template +int DirentLookup::compareWithDirentAt(char ns, const std::string& key, entry_index_type i) const +{ + const auto dirent = direntAccessor.getDirent(index_t(i)); + return ns < dirent->getNamespace() ? -1 + : ns > dirent->getNamespace() ? 
1 + : key.compare(TConfig::getDirentKey(*dirent)); +} + +template +class FastDirentLookup : public DirentLookup +{ + typedef DirentLookup BaseType; + typedef typename BaseType::DirentAccessor DirentAccessor; + typedef typename BaseType::index_t index_t; + +public: // functions + FastDirentLookup(const DirentAccessor* _direntAccessor, entry_index_type cacheEntryCount); + + typename BaseType::Result find(char ns, const std::string& key) const; + +private: // functions + std::string getDirentKey(entry_index_type i) const; + +private: // data + using BaseType::direntAccessor; + using BaseType::direntCount; + NarrowDown lookupGrid; +}; + +template +std::string +FastDirentLookup::getDirentKey(entry_index_type i) const +{ + const auto d = direntAccessor.getDirent(index_t(i)); + return d->getNamespace() + TConfig::getDirentKey(*d); +} + +template +DirentLookup::DirentLookup(const DirentAccessor* _direntAccessor) + : direntAccessor(*_direntAccessor) + , direntCount(direntAccessor.getDirentCount()) +{ +} + +template +FastDirentLookup::FastDirentLookup(const DirentAccessor* _direntAccessor, entry_index_type cacheEntryCount) + : BaseType(_direntAccessor) +{ + if ( direntCount ) + { + const entry_index_type step = std::max(1u, direntCount/cacheEntryCount); + for ( entry_index_type i = 0; i < direntCount-1; i += step ) + { + lookupGrid.add(getDirentKey(i), i, getDirentKey(i+1)); + } + lookupGrid.close(getDirentKey(direntCount - 1), direntCount - 1); + } +} + +template +entry_index_t getNamespaceBeginOffset(TDirentAccessor& direntAccessor, char ch) +{ + ASSERT(ch, >=, 32); + ASSERT(ch, <=, 127); + + entry_index_type lower = 0; + entry_index_type upper = entry_index_type(direntAccessor.getDirentCount()); + auto d = direntAccessor.getDirent(entry_index_t(0)); + while (upper - lower > 1) + { + entry_index_type m = lower + (upper - lower) / 2; + auto d = direntAccessor.getDirent(entry_index_t(m)); + if (d->getNamespace() >= ch) + upper = m; + else + lower = m; + } + + entry_index_t 
ret = entry_index_t(d->getNamespace() < ch ? upper : lower); + return ret; +} + +template +entry_index_t getNamespaceEndOffset(TDirentAccessor& direntAccessor, char ch) +{ + ASSERT(ch, >=, 32); + ASSERT(ch, <, 127); + return getNamespaceBeginOffset(direntAccessor, ch+1); +} + + + +template +typename DirentLookup::index_t +DirentLookup::getNamespaceRangeBegin(char ch) const +{ + ASSERT(ch, >=, 32); + ASSERT(ch, <=, 127); + + { + std::lock_guard lock(cacheAccessMutex); + const auto it = namespaceBoundaryCache.find(ch); + if (it != namespaceBoundaryCache.end()) + return it->second; + } + + auto ret = getNamespaceBeginOffset(direntAccessor, ch); + + std::lock_guard lock(cacheAccessMutex); + namespaceBoundaryCache[ch] = ret; + return ret; +} + +template +typename DirentLookup::index_t +DirentLookup::getNamespaceRangeEnd(char ns) const +{ + return getNamespaceRangeBegin(ns+1); +} + +template +typename DirentLookup::Result +FastDirentLookup::find(char ns, const std::string& key) const +{ + const auto r = lookupGrid.getRange(ns + key); + return BaseType::findInRange(r.begin, r.end, ns, key); +} + +template +typename DirentLookup::Result +DirentLookup::find(char ns, const std::string& key) const +{ + return findInRange(0, direntCount, ns, key); +} + +template +typename DirentLookup::Result +DirentLookup::findInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const +{ + if ( l == u ) + return { false, index_t(l) }; + + const auto c = compareWithDirentAt(ns, key, l); + if ( c < 0 ) + return { false, index_t(l) }; + else if ( c == 0 ) + return { true, index_t(l) }; + + if ( compareWithDirentAt(ns, key, u-1) > 0 ) + return { false, index_t(u) }; + + return binarySearchInRange(l, u-1, ns, key); +} + +template +typename DirentLookup::Result +DirentLookup::binarySearchInRange(entry_index_type l, entry_index_type u, char ns, const std::string& key) const +{ + assert(l <= u && u < direntCount); + assert(compareWithDirentAt(ns, key, l) > 0); + 
assert(compareWithDirentAt(ns, key, u) <= 0); + // Invariant maintained by the binary search: + // (entry at l) < (query entry ns/key) <= (entry at u) + while (true) + { + // compute p as the **upward rounded** average of l and u + const entry_index_type p = l + (u - l + 1) / 2; + const int c = compareWithDirentAt(ns, key, p); + if (c <= 0) { // (entry at l) < ns/key <= (entry at p) <= (entry at u) + if ( u == p ) { + return { c == 0, index_t(u) }; + } + u = p; + } else { // (entry at l) < (entry at p) < ns/key <= (entry at u) + l = p; + } + } +} + +} // namespace zim + +#endif // ZIM_DIRENT_LOOKUP_H diff --git a/src/direntreader.h b/src/direntreader.h new file mode 100644 index 0000000..2dc84ed --- /dev/null +++ b/src/direntreader.h @@ -0,0 +1,56 @@ +/* + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_DIRENTREADER_H +#define ZIM_DIRENTREADER_H + +#include "_dirent.h" +#include "reader.h" + +#include +#include +#include + +namespace zim +{ + +// Unlke FileReader and MemoryReader (which read data from a file and memory, +// respectively), DirentReader is a helper class that reads Dirents (rather +// than from a Dirent). 
+class DirentReader +{ +public: // functions + explicit DirentReader(std::shared_ptr zimReader) + : mp_zimReader(zimReader) + {} + + std::shared_ptr readDirent(offset_t offset); + +private: // functions + bool initDirent(Dirent& dirent, const Buffer& direntData) const; + +private: // data + std::shared_ptr mp_zimReader; + std::vector m_buffer; + std::mutex m_bufferMutex; +}; + +} // namespace zim + +#endif // ZIM_DIRENTREADER_H diff --git a/src/endian_tools.h b/src/endian_tools.h new file mode 100644 index 0000000..e51a58c --- /dev/null +++ b/src/endian_tools.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
/*
 * (tail of the GPL notice that opens on the previous line)
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

#ifndef ENDIAN_H
#define ENDIAN_H

// NOTE(review): the include targets were lost in the extract this file was
// recovered from; the headers below are the ones the code needs (size_t and
// the fixed-width integer types) - confirm against the repository.
#include <cstddef>
#include <cstdint>

namespace zim
{

// Serializes a value of type T occupying N bytes as little-endian.
// Only 2, 4 and 8 bytes wide types are supported.
template<typename T, size_t N>
struct ToLittleEndianImpl;

template<typename T>
struct ToLittleEndianImpl<T, 2>{
  static void write(const T& d, char* dst) {
    uint16_t v = static_cast<uint16_t>(d);
    dst[0] = static_cast<uint8_t>(v);
    dst[1] = static_cast<uint8_t>(v>>8);
  }
};

template<typename T>
struct ToLittleEndianImpl<T, 4>{
  static void write(const T& d, char* dst) {
    uint32_t v = static_cast<uint32_t>(d);
    dst[0] = static_cast<uint8_t>(v);
    dst[1] = static_cast<uint8_t>(v>>8);
    dst[2] = static_cast<uint8_t>(v>>16);
    dst[3] = static_cast<uint8_t>(v>>24);
  }
};

template<typename T>
struct ToLittleEndianImpl<T, 8>{
  static void write(const T& d, char* dst) {
    uint64_t v = static_cast<uint64_t>(d);
    dst[0] = static_cast<uint8_t>(v);
    dst[1] = static_cast<uint8_t>(v>>8);
    dst[2] = static_cast<uint8_t>(v>>16);
    dst[3] = static_cast<uint8_t>(v>>24);
    dst[4] = static_cast<uint8_t>(v>>32);
    dst[5] = static_cast<uint8_t>(v>>40);
    dst[6] = static_cast<uint8_t>(v>>48);
    dst[7] = static_cast<uint8_t>(v>>56);
  }
};

////////////////////////////////////////////////////////////////////////
// Writes `d` at `dst` (sizeof(T) bytes), least significant byte first.
template <typename T>
inline void toLittleEndian(T d, char* dst)
{
  ToLittleEndianImpl<T, sizeof(T)>::write(d, dst);
}

// Reads the sizeof(T) bytes at `ptr` as a little-endian value of type T.
template <typename T>
inline T fromLittleEndian(const char* ptr)
{
  static_assert(sizeof(T) <= sizeof(uint64_t), "unsupported integer width");
  // The bytes are assembled in an unsigned 64 bits value so that no bit is
  // ever shifted into (or beyond) the sign bit of a narrower/signed type.
  uint64_t ret = 0;
  for(size_t i=0; i<sizeof(T); i++) {
    ret |= (static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (i*8));
  }
  return static_cast<T>(ret);
}

}

#endif // ENDIAN_H

// diff --git a/src/entry.cpp b/src/entry.cpp
// new file mode 100644
// index 0000000..717d45e
// --- /dev/null
// +++ b/src/entry.cpp
// @@ -0,0 +1,94 @@
// /*
//  * Copyright (C) 2021 Renaud Gaudin
//  * Copyright (C) 2020 Matthieu Gautier
//  *
//  * This program is free software; you can redistribute it and/or
//  * modify it under the terms of the GNU General Public License as
//  * published by the Free Software Foundation; either version 2 of the
//  * License, or (at your option) any later version.
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include "_dirent.h" +#include "fileimpl.h" +#include "file_part.h" +#include "log.h" + +#include + +log_define("zim.entry") + +using namespace zim; + +Entry::Entry(std::shared_ptr file, entry_index_type idx) + : m_file(file), + m_idx(idx), + m_dirent(file->getDirent(entry_index_t(idx))) +{} + +std::string Entry::getTitle() const +{ + return m_dirent->getTitle(); +} + +std::string Entry::getPath() const +{ + if (m_file->hasNewNamespaceScheme()) { + return m_dirent->getUrl(); + } else { + return m_dirent->getLongUrl(); + } +} + +bool Entry::isRedirect() const +{ + return m_dirent->isRedirect(); +} + +Item Entry::getItem(bool follow) const +{ + if (isRedirect()) { + if (! 
follow) { + std::ostringstream sstream; + sstream << "Entry " << getPath() << " is a redirect entry."; + throw InvalidType(sstream.str()); + } + return getRedirect(); + } + + return Item(m_file, m_idx); +} + +Item Entry::getRedirect() const { + auto nextEntry = getRedirectEntry(); + auto watchdog = 50U; + while (nextEntry.isRedirect() && --watchdog) { + nextEntry = nextEntry.getRedirectEntry(); + } + return nextEntry.getItem(false); +} + +entry_index_type Entry::getRedirectEntryIndex() const { + if (!isRedirect()) { + std::ostringstream sstream; + sstream << "Entry " << getPath() << " is not a redirect entry."; + throw InvalidType(sstream.str()); + } + return m_dirent->getRedirectIndex().v; +} + +Entry Entry::getRedirectEntry() const { + return Entry(m_file, getRedirectEntryIndex()); +} diff --git a/src/envvalue.cpp b/src/envvalue.cpp new file mode 100644 index 0000000..1d5c64f --- /dev/null +++ b/src/envvalue.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
/*
 * (tail of the GPL notice that opens on the previous line)
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 */

// NOTE(review): the targets of the two original includes were lost in the
// extract this file was recovered from; <sstream> and <stdlib.h> are what
// the code uses.  <limits> is new (saturation in envMemSize()).
#include <limits>
#include <sstream>
#include <stdlib.h>

namespace zim
{
  /**
   * Returns the value of the environment variable `env` read as an unsigned
   * number, or `def` when the variable is not set or does not start with a
   * number that fits in an unsigned.
   *
   * The number is parsed into a temporary: a failed extraction writes 0 to
   * its target, so parsing straight into `def` used to replace the default
   * by 0 on malformed input.
   */
  unsigned envValue(const char* env, unsigned def)
  {
    const char* v = ::getenv(env);
    if (v)
    {
      std::istringstream s(v);
      unsigned parsed = 0;
      if (s >> parsed)
        def = parsed;
    }
    return def;
  }

  /**
   * Same as envValue() but the number may be followed by a unit:
   * k/K (x 1024), m/M (x 1024^2) or g/G (x 1024^3).  Any other suffix is
   * ignored.  A result that does not fit in an unsigned is clamped to the
   * largest unsigned value instead of wrapping around (e.g. "4G" used to
   * yield 0 with a 32 bits unsigned).
   */
  unsigned envMemSize(const char* env, unsigned def)
  {
    const char* v = ::getenv(env);
    if (v)
    {
      std::istringstream s(v);
      unsigned value = 0;
      if (s >> value)
      {
        char unit = '\0';
        s >> unit;  // optional; `unit` stays '\0' when there is none

        unsigned factor = 1;
        switch (unit)
        {
          case 'k':
          case 'K': factor = 1024u; break;
          case 'm':
          case 'M': factor = 1024u * 1024u; break;
          case 'g':
          case 'G': factor = 1024u * 1024u * 1024u; break;
        }

        const unsigned limit = std::numeric_limits<unsigned>::max();
        def = (value > limit / factor) ? limit : value * factor;
      }
    }
    return def;
  }
}

// diff --git a/src/envvalue.h b/src/envvalue.h
// new file mode 100644
// index 0000000..d6dffd4
// --- /dev/null
// +++ b/src/envvalue.h
// @@ -0,0 +1,29 @@
// /*
//  * Copyright (C) 2009 Tommi Maekitalo
//  *
//  * This program is free software; you can redistribute it and/or
//  * modify it under the terms of the GNU General Public License as
//  * published by the Free Software Foundation; either version 2 of the
//  * License, or (at your option) any later version.
//  *
//  * This program is distributed in the hope that it will be useful, but
//  * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
//  * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
//  * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_ENVVALUE_H +#define ZIM_ENVVALUE_H + +namespace zim +{ + unsigned envValue(const char* env, unsigned def); + unsigned envMemSize(const char* env, unsigned def); +} + +#endif // ZIM_ENVVALUE_H diff --git a/src/file_compound.cpp b/src/file_compound.cpp new file mode 100644 index 0000000..a8f6bf1 --- /dev/null +++ b/src/file_compound.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "file_compound.h" +#include "buffer.h" + +#include +#include +#include +#include + +#ifdef _WIN32 +# include +#else +# include +#endif + +namespace zim { + +void FileCompound::addPart(FilePart* fpart) +{ + const Range newRange(offset_t(_fsize.v), offset_t((_fsize+fpart->size()).v)); + emplace(newRange, fpart); + _fsize += fpart->size(); +} + +FileCompound::FileCompound(const std::string& filename): + _filename(filename), + _fsize(0) +{ + try { + addPart(new FilePart(filename)); + } catch(...) 
{ + int errnoSave = errno; + _fsize = zsize_t(0); + try { + for (char ch0 = 'a'; ch0 <= 'z'; ++ch0) + { + const std::string fname0 = filename + ch0; + for (char ch1 = 'a'; ch1 <= 'z'; ++ch1) + { + addPart(new FilePart(fname0 + ch1)); + } + } + } catch (...) { } + + if (empty()) + { + std::ostringstream msg; + msg << "error " << errnoSave << " opening file \"" << filename; + throw std::runtime_error(msg.str()); + } + } +} + +#ifndef _WIN32 +FileCompound::FileCompound(int fd): + _filename(), + _fsize(0) +{ + addPart(new FilePart(fd)); +} +#endif + +FileCompound::~FileCompound() { + for(auto it=begin(); it!=end(); it++) { + auto filepart = it->second; + delete filepart; + } +} + +time_t FileCompound::getMTime() const { + if (mtime || empty()) + return mtime; + + const char* fname = begin()->second->filename().c_str(); + + #if defined(HAVE_STAT64) && ! defined(__APPLE__) + struct stat64 st; + int ret = ::stat64(fname, &st); + #else + struct stat st; + int ret = ::stat(fname, &st); + #endif + if (ret != 0) + { + std::ostringstream msg; + msg << "stat failed with errno " << errno << " : " << strerror(errno); + throw std::runtime_error(msg.str()); + } + mtime = st.st_mtime; + + return mtime; +} + +} // zim diff --git a/src/file_compound.h b/src/file_compound.h new file mode 100644 index 0000000..47b036e --- /dev/null +++ b/src/file_compound.h @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_COMPOUND_H_ +#define ZIM_FILE_COMPOUND_H_ + +#include "file_part.h" +#include "zim_types.h" +#include "debug.h" +#include +#include +#include + +namespace zim { + +struct Range { + Range(const offset_t min, const offset_t max) + : min(min), max(max) + { + // ASSERT(min, <, max); + } + + const offset_t min; + const offset_t max; +}; + +struct less_range : public std::binary_function< Range, Range, bool> +{ + bool operator()(const Range& lhs, const Range& rhs) const { + return lhs.min < rhs.min && lhs.max <= rhs.min; + } +}; + +class FileCompound : private std::map { + typedef std::map ImplType; + + public: // types + typedef const_iterator PartIterator; + typedef std::pair PartRange; + + public: // functions + explicit FileCompound(const std::string& filename); + +#ifndef _WIN32 + explicit FileCompound(int fd); +#endif + + ~FileCompound(); + + using ImplType::begin; + using ImplType::end; + + const std::string& filename() const { return _filename; } + zsize_t fsize() const { return _fsize; }; + time_t getMTime() const; + bool fail() const { return empty(); }; + bool is_multiPart() const { return size() > 1; }; + + PartIterator locate(offset_t offset) const { + const PartIterator partIt = lower_bound(Range(offset, offset)); + ASSERT(partIt != end(), ==, true); + return partIt; + } + + PartRange locate(offset_t offset, zsize_t size) const { +#if ! 
defined(__APPLE__) + return equal_range(Range(offset, offset+size)); +#else + // Workaround for https://github.com/openzim/libzim/issues/398 + // Under MacOS the implementation of std::map::equal_range() makes + // assumptions about the properties of the key comparison function and + // abuses the std::map requirement that it must contain unique keys. As + // a result, when a map m is queried with an element k that is + // equivalent to more than one keys present in m, + // m.equal_range(k).first may be different from m.lower_bound(k) (the + // latter one returning the correct result). + const Range queryRange(offset, offset+size); + return {lower_bound(queryRange), upper_bound(queryRange)}; +#endif // ! defined(__APPLE__) + } + + private: // functions + void addPart(FilePart* fpart); + + private: // data + std::string _filename; + zsize_t _fsize; + mutable time_t mtime; +}; + + +}; + + +#endif //ZIM_FILE_COMPOUND_H_ diff --git a/src/file_part.h b/src/file_part.h new file mode 100644 index 0000000..6362baf --- /dev/null +++ b/src/file_part.h @@ -0,0 +1,69 @@ +/* + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_PART_H_ +#define ZIM_FILE_PART_H_ + +#include +#include +#include + +#include + +#include "zim_types.h" +#include "fs.h" + +namespace zim { + +class FilePart { + typedef DEFAULTFS FS; + + public: + using FDSharedPtr = std::shared_ptr; + + public: + FilePart(const std::string& filename) : + m_filename(filename), + m_fhandle(std::make_shared(FS::openFile(filename))), + m_size(m_fhandle->getSize()) {} + +#ifndef _WIN32 + FilePart(int fd) : + FilePart(getFilePathFromFD(fd)) {} +#endif + + ~FilePart() = default; + const std::string& filename() const { return m_filename; }; + const FS::FD& fhandle() const { return *m_fhandle; }; + const FDSharedPtr& shareable_fhandle() const { return m_fhandle; }; + + zsize_t size() const { return m_size; }; + bool fail() const { return !m_size; }; + bool good() const { return bool(m_size); }; + + private: + const std::string m_filename; + FDSharedPtr m_fhandle; + zsize_t m_size; +}; + +}; + +#endif //ZIM_FILE_PART_H_ diff --git a/src/file_reader.cpp b/src/file_reader.cpp new file mode 100644 index 0000000..74cbc3a --- /dev/null +++ b/src/file_reader.cpp @@ -0,0 +1,293 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "file_reader.h" +#include "file_compound.h" +#include "buffer.h" +#include +#include +#include +#include +#include +#include +#include + + +#ifndef _WIN32 +# include +# include +#endif + +#if defined(_MSC_VER) +# include +# include + typedef SSIZE_T ssize_t; +#endif + +namespace zim { + +//////////////////////////////////////////////////////////////////////////////// +// MultiPartFileReader +//////////////////////////////////////////////////////////////////////////////// + +MultiPartFileReader::MultiPartFileReader(std::shared_ptr source) + : MultiPartFileReader(source, offset_t(0), source->fsize()) {} + +MultiPartFileReader::MultiPartFileReader(std::shared_ptr source, offset_t offset, zsize_t size) + : source(source), + _offset(offset), + _size(size) +{ + ASSERT(offset.v, <=, source->fsize().v); + ASSERT(offset.v+size.v, <=, source->fsize().v); +} + +char MultiPartFileReader::read(offset_t offset) const { + ASSERT(offset.v, <, _size.v); + offset += _offset; + auto part_pair = source->locate(offset); + auto& fhandle = part_pair->second->fhandle(); + offset_t local_offset = offset - part_pair->first.min; + ASSERT(local_offset, <=, part_pair->first.max); + char ret; + try { + fhandle.readAt(&ret, zsize_t(1), local_offset); + } catch (std::runtime_error& e) { + //Error while reading. 
+ std::ostringstream s; + s << "Cannot read a char.\n"; + s << " - File part is " << part_pair->second->filename() << "\n"; + s << " - File part size is " << part_pair->second->size().v << "\n"; + s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + return ret; +} + +void MultiPartFileReader::read(char* dest, offset_t offset, zsize_t size) const { + ASSERT(offset.v, <=, _size.v); + ASSERT(offset.v+size.v, <=, _size.v); + if (! size ) { + return; + } + offset += _offset; + auto found_range = source->locate(offset, size); + for(auto current = found_range.first; current!=found_range.second; current++){ + auto part = current->second; + Range partRange = current->first; + offset_t local_offset = offset-partRange.min; + ASSERT(size.v, >, 0U); + zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v)); + try { + part->fhandle().readAt(dest, size_to_get, local_offset); + } catch (std::runtime_error& e) { + std::ostringstream s; + s << "Cannot read chars.\n"; + s << " - File part is " << part->filename() << "\n"; + s << " - File part size is " << part->size().v << "\n"; + s << " - File part range is " << partRange.min << "-" << partRange.max << "\n"; + s << " - size_to_get is " << size_to_get.v << "\n"; + s << " - total size is " << size.v << "\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - local offset is " << local_offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + ASSERT(size_to_get, <=, size); + dest += size_to_get.v; + size -= size_to_get; + offset += size_to_get; + } + ASSERT(size.v, ==, 0U); +} + +#ifdef 
ENABLE_USE_MMAP +namespace +{ + +class MMapException : std::exception {}; + +char* +mmapReadOnly(int fd, offset_type offset, size_type size) +{ +#if defined(__APPLE__) || defined(__OpenBSD__) + const auto MAP_FLAGS = MAP_PRIVATE; +#elif defined(__FreeBSD__) + const auto MAP_FLAGS = MAP_PRIVATE|MAP_PREFAULT_READ; +#else + const auto MAP_FLAGS = MAP_PRIVATE|MAP_POPULATE; +#endif + + const auto p = (char*)mmap(NULL, size, PROT_READ, MAP_FLAGS, fd, offset); + if (p == MAP_FAILED ) + { + std::ostringstream s; + s << "Cannot mmap size " << size << " at off " << offset + << " : " << strerror(errno); + throw std::runtime_error(s.str()); + } + return p; +} + +Buffer::DataPtr +makeMmappedBuffer(int fd, offset_t offset, zsize_t size) +{ + const offset_type pageAlignedOffset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1)); + const size_t alignmentAdjustment = offset.v - pageAlignedOffset; + size += alignmentAdjustment; + +#if !MMAP_SUPPORT_64 + if(pageAlignedOffset >= INT32_MAX) { + throw MMapException(); + } +#endif + char* const mmappedAddress = mmapReadOnly(fd, pageAlignedOffset, size.v); + const auto munmapDeleter = [mmappedAddress, size](char* ) { + munmap(mmappedAddress, size.v); + }; + + return Buffer::DataPtr(mmappedAddress+alignmentAdjustment, munmapDeleter); +} + +} // unnamed namespace +#endif // ENABLE_USE_MMAP + +const Buffer MultiPartFileReader::get_buffer(offset_t offset, zsize_t size) const { + ASSERT(size, <=, _size); +#ifdef ENABLE_USE_MMAP + try { + auto found_range = source->locate(_offset+offset, size); + auto first_part_containing_it = found_range.first; + if (++first_part_containing_it != found_range.second) { + throw MMapException(); + } + + // The range is in only one part + auto range = found_range.first->first; + auto part = found_range.first->second; + auto local_offset = offset + _offset - range.min; + ASSERT(size, <=, part->size()); + int fd = part->fhandle().getNativeHandle(); + return Buffer::makeBuffer(makeMmappedBuffer(fd, local_offset, size), 
size); + } catch(MMapException& e) +#endif + { + // The range is several part, or we are on Windows. + // We will have to do some memory copies :/ + // [TODO] Use Windows equivalent for mmap. + auto ret_buffer = Buffer::makeBuffer(size); + read(const_cast(ret_buffer.data()), offset, size); + return ret_buffer; + } +} + +bool Reader::can_read(offset_t offset, zsize_t size) const +{ + return (offset.v <= this->size().v && (offset.v+size.v) <= this->size().v); +} + + +std::unique_ptr MultiPartFileReader::sub_reader(offset_t offset, zsize_t size) const +{ + ASSERT(offset.v+size.v, <=, _size.v); + // TODO: can use a FileReader here if the new range fully belongs to a single part + return std::unique_ptr(new MultiPartFileReader(source, _offset+offset, size)); +} + +//////////////////////////////////////////////////////////////////////////////// +// FileReader +//////////////////////////////////////////////////////////////////////////////// + +FileReader::FileReader(FileHandle fh, offset_t offset, zsize_t size) + : _fhandle(fh) + , _offset(offset) + , _size(size) +{ +} + +char FileReader::read(offset_t offset) const +{ + ASSERT(offset.v, <, _size.v); + offset += _offset; + char ret; + try { + _fhandle->readAt(&ret, zsize_t(1), offset); + } catch (std::runtime_error& e) { + //Error while reading. + std::ostringstream s; + s << "Cannot read a char.\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; + return ret; +} + +void FileReader::read(char* dest, offset_t offset, zsize_t size) const +{ + ASSERT(offset.v, <=, _size.v); + ASSERT(offset.v+size.v, <=, _size.v); + if (! 
size ) { + return; + } + offset += _offset; + try { + _fhandle->readAt(dest, size, offset); + } catch (std::runtime_error& e) { + std::ostringstream s; + s << "Cannot read chars.\n"; + s << " - Reading offset at " << offset.v << "\n"; + s << " - size is " << size.v << "\n"; + s << " - error is " << strerror(errno) << "\n"; + std::error_code ec(errno, std::generic_category()); + throw std::system_error(ec, s.str()); + }; +} + +const Buffer FileReader::get_buffer(offset_t offset, zsize_t size) const +{ + ASSERT(size, <=, _size); +#ifdef ENABLE_USE_MMAP + offset += _offset; + int fd = _fhandle->getNativeHandle(); + return Buffer::makeBuffer(makeMmappedBuffer(fd, offset, size), size); +#else // We are on Windows. [TODO] Use Windows equivalent for mmap. + auto ret_buffer = Buffer::makeBuffer(size); + read(const_cast(ret_buffer.data()), offset, size); + return ret_buffer; +#endif +} + +std::unique_ptr +FileReader::sub_reader(offset_t offset, zsize_t size) const +{ + ASSERT(offset.v+size.v, <=, _size.v); + return std::unique_ptr(new FileReader(_fhandle, _offset + offset, size)); +} + +} // zim diff --git a/src/file_reader.h b/src/file_reader.h new file mode 100644 index 0000000..36c3a74 --- /dev/null +++ b/src/file_reader.h @@ -0,0 +1,81 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILE_READER_H_ +#define ZIM_FILE_READER_H_ + +#include "reader.h" +#include "fs.h" + +namespace zim { + +class FileCompound; + +class FileReader : public Reader { + public: // types + typedef std::shared_ptr FileHandle; + + public: // functions + explicit FileReader(FileHandle fh, offset_t offset, zsize_t size); + ~FileReader() = default; + + zsize_t size() const { return _size; }; + offset_t offset() const { return _offset; }; + + char read(offset_t offset) const; + void read(char* dest, offset_t offset, zsize_t size) const; + const Buffer get_buffer(offset_t offset, zsize_t size) const; + + std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; + + private: // data + // The file handle is stored via a shared pointer so that it can be shared + // by a sub_reader (otherwise the file handle would be invalidated by + // FD destructor when the sub-reader is destroyed). 
+ FileHandle _fhandle; + offset_t _offset; + zsize_t _size; +}; + +class MultiPartFileReader : public Reader { + public: + MultiPartFileReader(std::shared_ptr source); + ~MultiPartFileReader() {}; + + zsize_t size() const { return _size; }; + offset_t offset() const { return _offset; }; + + char read(offset_t offset) const; + void read(char* dest, offset_t offset, zsize_t size) const; + const Buffer get_buffer(offset_t offset, zsize_t size) const; + + std::unique_ptr sub_reader(offset_t offset, zsize_t size) const; + + private: + MultiPartFileReader(std::shared_ptr source, offset_t offset, zsize_t size); + + std::shared_ptr source; + offset_t _offset; + zsize_t _size; +}; + +}; + +#endif // ZIM_FILE_READER_H_ diff --git a/src/fileheader.cpp b/src/fileheader.cpp new file mode 100644 index 0000000..e985094 --- /dev/null +++ b/src/fileheader.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2017-2020 Mattieu Gautier + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fileheader.h" +#include +#include +#include +#include "log.h" +#include "endian_tools.h" +#include "reader.h" +#include "bufferstreamer.h" +#include "buffer.h" +#ifdef _WIN32 +# include "io.h" +#else +# include "unistd.h" +# define _write(fd, addr, size) ::write((fd), (addr), (size)) +#endif + +log_define("zim.file.header") + +namespace zim +{ + const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d" + const uint16_t Fileheader::zimOldMajorVersion = 5; + const uint16_t Fileheader::zimMajorVersion = 6; + const uint16_t Fileheader::zimMinorVersion = 1; + const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset) + + void Fileheader::write(int out_fd) const + { + char header[Fileheader::size]; + toLittleEndian(Fileheader::zimMagic, header); + toLittleEndian(getMajorVersion(), header + 4); + toLittleEndian(getMinorVersion(), header + 6); + std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8); + toLittleEndian(getArticleCount(), header + 24); + toLittleEndian(getClusterCount(), header + 28); + toLittleEndian(getUrlPtrPos(), header + 32); + toLittleEndian(getTitleIdxPos(), header + 40); + toLittleEndian(getClusterPtrPos(), header + 48); + toLittleEndian(getMimeListPos(), header + 56); + toLittleEndian(getMainPage(), header + 64); + toLittleEndian(getLayoutPage(), header + 68); + toLittleEndian(getChecksumPos(), header + 72); + + auto ret = _write(out_fd, header, Fileheader::size); + if (ret != Fileheader::size) { + std::cerr << "Error Writing" << std::endl; + std::cerr << "Ret is " << ret << std::endl; + perror("Error writing"); + throw std::runtime_error("Error writing"); + } + } + + void Fileheader::read(const Reader& reader) + { + auto buffer = reader.get_buffer(offset_t(0), 
zsize_t(Fileheader::size)); + auto seqReader = BufferStreamer(buffer); + uint32_t magicNumber = seqReader.read(); + if (magicNumber != Fileheader::zimMagic) + { + log_error("invalid magic number " << magicNumber << " found - " + << Fileheader::zimMagic << " expected"); + throw ZimFileFormatError("Invalid magic number"); + } + + uint16_t major_version = seqReader.read(); + if (major_version != zimOldMajorVersion && major_version != zimMajorVersion) + { + log_error("invalid zimfile major version " << major_version << " found - " + << Fileheader::zimMajorVersion << " expected"); + throw ZimFileFormatError("Invalid version"); + } + setMajorVersion(major_version); + + setMinorVersion(seqReader.read()); + + Uuid uuid; + std::copy(seqReader.current(), seqReader.current()+16, uuid.data); + seqReader.skip(zsize_t(16)); + setUuid(uuid); + + setArticleCount(seqReader.read()); + setClusterCount(seqReader.read()); + setUrlPtrPos(seqReader.read()); + setTitleIdxPos(seqReader.read()); + setClusterPtrPos(seqReader.read()); + setMimeListPos(seqReader.read()); + setMainPage(seqReader.read()); + setLayoutPage(seqReader.read()); + setChecksumPos(seqReader.read()); + + sanity_check(); + } + + void Fileheader::sanity_check() const { + if (!!articleCount != !!clusterCount) { + throw ZimFileFormatError("No article <=> No cluster"); + } + + if (mimeListPos != size && mimeListPos != 72) { + throw ZimFileFormatError("mimelistPos must be 80."); + } + + if (urlPtrPos < mimeListPos) { + throw ZimFileFormatError("urlPtrPos must be > mimelistPos."); + } + if (titleIdxPos < mimeListPos) { + throw ZimFileFormatError("titleIdxPos must be > mimelistPos."); + } + if (clusterPtrPos < mimeListPos) { + throw ZimFileFormatError("clusterPtrPos must be > mimelistPos."); + } + + if (clusterCount > articleCount) { + throw ZimFileFormatError("Cluster count cannot be higher than article count."); + } + + if (checksumPos != 0 && checksumPos < mimeListPos) { + throw ZimFileFormatError("checksumPos must be > 
mimeListPos."); + } + } + +} diff --git a/src/fileheader.h b/src/fileheader.h new file mode 100644 index 0000000..95be691 --- /dev/null +++ b/src/fileheader.h @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2017-2020 Matthieu Gautier + * Copyright (C) 2008 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEHEADER_H +#define ZIM_FILEHEADER_H + +#include +#include +#include +#include +#include + +// max may be defined as a macro by window includes +#ifdef max +#undef max +#endif + +namespace zim +{ + class Reader; + class Fileheader + { + public: + static const uint32_t zimMagic; + static const uint16_t zimOldMajorVersion; + static const uint16_t zimMajorVersion; + static const uint16_t zimMinorVersion; + static const size_type size; + + private: + uint16_t majorVersion; + uint16_t minorVersion; + Uuid uuid; + entry_index_type articleCount; + offset_type titleIdxPos; + offset_type urlPtrPos; + offset_type mimeListPos; + cluster_index_type clusterCount; + offset_type clusterPtrPos; + entry_index_type mainPage; + entry_index_type layoutPage; + offset_type checksumPos; + + public: + Fileheader() + : majorVersion(zimMajorVersion), + minorVersion(zimMinorVersion), + articleCount(0), + titleIdxPos(0), + urlPtrPos(0), + clusterCount(0), 
+ clusterPtrPos(0), + mainPage(std::numeric_limits::max()), + layoutPage(std::numeric_limits::max()), + checksumPos(std::numeric_limits::max()) + {} + + void write(int out_fd) const; + void read(const Reader& reader); + + // Do some sanity check, raise a ZimFileFormatError if + // something is wrong. + void sanity_check() const; + + uint16_t getMajorVersion() const { return majorVersion; } + void setMajorVersion(uint16_t v) { majorVersion = v; } + + uint16_t getMinorVersion() const { return minorVersion; } + void setMinorVersion(uint16_t v) { minorVersion = v; } + + const Uuid& getUuid() const { return uuid; } + void setUuid(const Uuid& uuid_) { uuid = uuid_; } + + entry_index_type getArticleCount() const { return articleCount; } + void setArticleCount(entry_index_type s) { articleCount = s; } + + offset_type getTitleIdxPos() const { return titleIdxPos; } + void setTitleIdxPos(offset_type p) { titleIdxPos = p; } + + offset_type getUrlPtrPos() const { return urlPtrPos; } + void setUrlPtrPos(offset_type p) { urlPtrPos = p; } + + offset_type getMimeListPos() const { return mimeListPos; } + void setMimeListPos(offset_type p) { mimeListPos = p; } + + cluster_index_type getClusterCount() const { return clusterCount; } + void setClusterCount(cluster_index_type s) { clusterCount = s; } + + offset_type getClusterPtrPos() const { return clusterPtrPos; } + void setClusterPtrPos(offset_type p) { clusterPtrPos = p; } + + bool hasMainPage() const { return mainPage != std::numeric_limits::max(); } + entry_index_type getMainPage() const { return mainPage; } + void setMainPage(entry_index_type s){ mainPage = s; } + + bool hasLayoutPage() const { return layoutPage != std::numeric_limits::max(); } + entry_index_type getLayoutPage() const { return layoutPage; } + void setLayoutPage(entry_index_type s) { layoutPage = s; } + + bool hasChecksum() const { return getMimeListPos() >= 80; } + offset_type getChecksumPos() const { return hasChecksum() ? 
checksumPos : 0; } + void setChecksumPos(offset_type p) { checksumPos = p; } + }; + +} + +#endif // ZIM_FILEHEADER_H diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp new file mode 100644 index 0000000..1593ba6 --- /dev/null +++ b/src/fileimpl.cpp @@ -0,0 +1,676 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2006,2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "fileimpl.h" +#include +#include "_dirent.h" +#include "file_compound.h" +#include "buffer_reader.h" +#include +#include +#include +#include +#include +#include +#include "config.h" +#include "log.h" +#include "envvalue.h" +#include "md5.h" +#include "tools.h" + +log_define("zim.file.impl") + +namespace zim +{ + +namespace +{ + +offset_t readOffset(const Reader& reader, entry_index_type idx) +{ + offset_t offset(reader.read_uint(offset_t(sizeof(offset_type)*idx))); + return offset; +} + +std::unique_ptr +sectionSubReader(const Reader& zimReader, const std::string& sectionName, + offset_t offset, zsize_t size) +{ + if (!zimReader.can_read(offset, size)) { + throw ZimFileFormatError(sectionName + " outside (or not fully inside) ZIM file."); + } +#ifdef ENABLE_USE_BUFFER_HEADER + const auto buf = 
zimReader.get_buffer(offset, size); + return std::unique_ptr(new BufferReader(buf)); +#else + return zimReader.sub_reader(offset, size); +#endif +} + +std::shared_ptr +makeFileReader(std::shared_ptr zimFile, offset_t offset, zsize_t size) +{ + if (zimFile->fail()) { + return nullptr; + } else if ( zimFile->is_multiPart() ) { + ASSERT(offset.v, ==, 0u); + ASSERT(size, ==, zimFile->fsize()); + return std::make_shared(zimFile); + } else { + const auto& firstAndOnlyPart = zimFile->begin()->second; + return std::make_shared(firstAndOnlyPart->shareable_fhandle(), offset, size); + } +} + +} //unnamed namespace + + ////////////////////////////////////////////////////////////////////// + // FileImpl + // + FileImpl::FileImpl(const std::string& fname) + : FileImpl(std::make_shared(fname)) + {} + +#ifndef _WIN32 + FileImpl::FileImpl(int fd) + : FileImpl(std::make_shared(fd)) + {} + + FileImpl::FileImpl(int fd, offset_t offset, zsize_t size) + : FileImpl(std::make_shared(fd), offset, size) + {} +#endif + + FileImpl::FileImpl(std::shared_ptr _zimFile) + : FileImpl(_zimFile, offset_t(0), _zimFile->fsize()) + {} + + FileImpl::FileImpl(std::shared_ptr _zimFile, offset_t offset, zsize_t size) + : zimFile(_zimFile), + archiveStartOffset(offset), + zimReader(makeFileReader(zimFile, offset, size)), + direntReader(new DirentReader(zimReader)), + clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)), + m_newNamespaceScheme(false), + m_hasFrontArticlesIndex(true), + m_startUserEntry(0), + m_endUserEntry(0) + { + log_trace("read file \"" << zimFile->filename() << '"'); + + if (zimFile->fail()) + throw ZimFileFormatError(std::string("can't open zim-file \"") + zimFile->filename() + '"'); + + // read header + if (size_type(zimReader->size()) < Fileheader::size) { + throw ZimFileFormatError("zim-file is too small to contain a header"); + } + try { + header.read(*zimReader); + } catch (ZimFileFormatError& e) { + throw e; + } catch (...) 
{ + throw ZimFileFormatError("error reading zim-file header."); + } + + auto urlPtrReader = sectionSubReader(*zimReader, + "Dirent pointer table", + offset_t(header.getUrlPtrPos()), + zsize_t(sizeof(offset_type)*header.getArticleCount())); + + mp_urlDirentAccessor.reset( + new DirectDirentAccessor(direntReader, std::move(urlPtrReader), entry_index_t(header.getArticleCount()))); + + + clusterOffsetReader = sectionSubReader(*zimReader, + "Cluster pointer table", + offset_t(header.getClusterPtrPos()), + zsize_t(sizeof(offset_type)*header.getClusterCount())); + + quickCheckForCorruptFile(); + + mp_titleDirentAccessor = getTitleAccessor("listing/titleOrdered/v1"); + + if (!mp_titleDirentAccessor) { + offset_t titleOffset(header.getTitleIdxPos()); + zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); + mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table"); + const_cast(m_hasFrontArticlesIndex) = false; + } + m_byTitleDirentLookup.reset(new ByTitleDirentLookup(mp_titleDirentAccessor.get())); + + readMimeTypes(); + } + + std::unique_ptr FileImpl::getTitleAccessor(const std::string& path) + { + auto result = direntLookup().find('X', path); + if (!result.first) { + return nullptr; + } + + auto dirent = mp_urlDirentAccessor->getDirent(result.second); + auto cluster = getCluster(dirent->getClusterNumber()); + if (cluster->isCompressed()) { + // This is a ZimFileFormatError. 
+ // Let's be tolerant and skip the entry + return nullptr; + } + auto titleOffset = getClusterOffset(dirent->getClusterNumber()) + cluster->getBlobOffset(dirent->getBlobNumber()); + auto titleSize = cluster->getBlobSize(dirent->getBlobNumber()); + return getTitleAccessor(titleOffset, titleSize, "Title index table" + path); + } + + std::unique_ptr FileImpl::getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name) + { + auto titleIndexReader = sectionSubReader(*zimReader, + name, + offset, + size); + + return std::unique_ptr( + new IndirectDirentAccessor(mp_urlDirentAccessor, std::move(titleIndexReader), title_index_t(size.v/sizeof(entry_index_type)))); + } + + FileImpl::DirentLookup& FileImpl::direntLookup() const + { + // Not using std::call_once because it is buggy. + // 1. It doesn't play well with musl libc - an exception thrown by the + // callable results in SIGABRT even if there is a handler for it higher + // in the call stack. + // 2. With `glibc` an exceptional execution of `std::call_once` doesn't + // unlock the mutex associated with the `std::once_flag` object. 
+ if ( !m_direntLookup ) { + std::lock_guard lock(m_direntLookupCreationMutex); + if ( !m_direntLookup ) { + const auto cacheSize = envValue("ZIM_DIRENTLOOKUPCACHE", DIRENT_LOOKUP_CACHE_SIZE); + m_direntLookup.reset(new DirentLookup(mp_urlDirentAccessor.get(), cacheSize)); + } + } + return *m_direntLookup; + } + + void FileImpl::quickCheckForCorruptFile() + { + if (!getCountClusters()) + log_warn("no clusters found"); + else + { + offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1)); + log_debug("last offset=" << lastOffset.v << " file size=" << getFilesize().v); + if (lastOffset.v > getFilesize().v) + { + log_fatal("last offset (" << lastOffset << ") larger than file size (" << getFilesize() << ')'); + throw ZimFileFormatError("last cluster offset larger than file size; file corrupt"); + } + } + + if (header.hasChecksum() && header.getChecksumPos() != (getFilesize().v-16) ) { + throw ZimFileFormatError("Checksum position is not valid"); + } + } + + offset_type FileImpl::getMimeListEndUpperLimit() const + { + offset_type result(header.getUrlPtrPos()); + result = std::min(result, header.getTitleIdxPos()); + result = std::min(result, header.getClusterPtrPos()); + if ( getCountArticles().v != 0 ) { + // assuming that dirents are placed in the zim file in the same + // order as the corresponding entries in the dirent pointer table + result = std::min(result, mp_urlDirentAccessor->getOffset(entry_index_t(0)).v); + + // assuming that clusters are placed in the zim file in the same + // order as the corresponding entries in the cluster pointer table + result = std::min(result, readOffset(*clusterOffsetReader, 0).v); + } + return result; + } + + void FileImpl::readMimeTypes() + { + // read mime types + // libzim write zims files two ways : + // - The old way by putting the urlPtrPos just after the mimetype. + // - The new way by putting the urlPtrPos at the end of the zim files. 
+ // In this case, the cluster data are always at 1024 bytes offset and we know that + // mimetype list is before this. + // 1024 seems to be a good maximum size for the mimetype list, even for the "old" way. + const auto endMimeList = getMimeListEndUpperLimit(); + if ( endMimeList <= header.getMimeListPos() ) { + throw(ZimFileFormatError("Bad ZIM archive")); + } + const zsize_t size(endMimeList - header.getMimeListPos()); + if ( endMimeList > 1024 ) { + log_warn("The MIME-type list is abnormally large (" << size.v << " bytes)"); + } + auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size); + const char* const bufferEnd = buffer.data() + size.v; + const char* p = buffer.data(); + while (*p != '\0') { + const char* zp = std::find(p, bufferEnd, '\0'); + + if (zp == bufferEnd) { + throw(ZimFileFormatError("Error getting mimelists.")); + } + + std::string mimeType(p, zp); + mimeTypes.push_back(mimeType); + + p = zp+1; + } + + const_cast(m_newNamespaceScheme) = header.getMinorVersion() >= 1; + if (m_newNamespaceScheme) { + const_cast(m_startUserEntry) = getNamespaceBeginOffset('C'); + const_cast(m_endUserEntry) = getNamespaceEndOffset('C'); + } else { + const_cast(m_endUserEntry) = getCountArticles(); + } + } + + FileImpl::FindxResult FileImpl::findx(char ns, const std::string& url) + { + return direntLookup().find(ns, url); + } + + FileImpl::FindxResult FileImpl::findx(const std::string& url) + { + char ns; + std::string path; + try { + std::tie(ns, path) = parseLongPath(url); + return findx(ns, path); + } catch (...) 
{} + return { false, entry_index_t(0) }; + } + + static inline int direntCompareTitle(char ns, const std::string& title, const Dirent& dirent) + { + auto direntNs = dirent.getNamespace(); + if (ns < direntNs) { + return -1; + } + if (ns > direntNs) { + return 1; + } + return title.compare(dirent.getTitle()); + } + + FileImpl::FindxTitleResult FileImpl::findxByTitle(char ns, const std::string& title) + { + return m_byTitleDirentLookup->find(ns, title); + } + + FileCompound::PartRange + FileImpl::getFileParts(offset_t offset, zsize_t size) + { + return zimFile->locate(offset, size); + } + + std::shared_ptr FileImpl::getDirent(entry_index_t idx) + { + return mp_urlDirentAccessor->getDirent(idx); + } + + std::shared_ptr FileImpl::getDirentByTitle(title_index_t idx) + { + return mp_titleDirentAccessor->getDirent(idx); + } + + entry_index_t FileImpl::getIndexByTitle(title_index_t idx) const + { + return mp_titleDirentAccessor->getDirectIndex(idx); + } + + entry_index_t FileImpl::getFrontEntryCount() const + { + return entry_index_t(mp_titleDirentAccessor->getDirentCount().v); + } + + void FileImpl::prepareArticleListByCluster() const + { + m_articleListByCluster.reserve(getUserEntryCount().v); + + auto endIdx = getEndUserEntry().v; + for(auto i = getStartUserEntry().v; i < endIdx; i++) + { + // This is the offset of the dirent in the zimFile + auto indexOffset = mp_urlDirentAccessor->getOffset(entry_index_t(i)); + // Get the mimeType of the dirent (offset 0) to know the type of the dirent + uint16_t mimeType = zimReader->read_uint(indexOffset); + if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) { + m_articleListByCluster.push_back(std::make_pair(0, i)); + } else { + // If it is a classic article, get the clusterNumber (at offset 8) + auto clusterNumber = zimReader->read_uint(indexOffset+offset_t(8)); + m_articleListByCluster.push_back(std::make_pair(clusterNumber, i)); + } + } + 
std::sort(m_articleListByCluster.begin(), m_articleListByCluster.end()); + } + + entry_index_t FileImpl::getIndexByClusterOrder(entry_index_t idx) const + { + // Not using std::call_once because it is buggy. See the comment + // in FileImpl::direntLookup(). + if ( m_articleListByCluster.empty() ) { + std::lock_guard lock(m_articleListByClusterMutex); + if ( m_articleListByCluster.empty() ) { + prepareArticleListByCluster(); + } + } + if (idx.v >= m_articleListByCluster.size()) + throw std::out_of_range("entry index out of range"); + return entry_index_t(m_articleListByCluster[idx.v].second); + } + + FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx) + { + offset_t clusterOffset(getClusterOffset(idx)); + log_debug("read cluster " << idx << " from offset " << clusterOffset); + return Cluster::read(*zimReader, clusterOffset); + } + + std::shared_ptr FileImpl::getCluster(cluster_index_t idx) + { + if (idx >= getCountClusters()) + throw ZimFileFormatError("cluster index out of range"); + + auto cluster = clusterCache.getOrPut(idx.v, [=](){ return readCluster(idx); }); +#if ENV32BIT + // There was a bug in the way we create the zim files using ZSTD compression. + // We were using a too hight compression level and so a window of 128Mb. + // So at decompression, zstd reserve a 128Mb buffer. + // While this memory is not really used (thanks to lazy allocation of OS), + // we are still consumming address space. On 32bits this start to be a rare + // ressource when we reserved 128Mb at once. + // So we drop the cluster from the cache to avoid future memory allocation error. + if (cluster->getCompression() == Cluster::Compression::Zstd) { + // ZSTD compression starts to be used on version 5.0 of zim format. + // Recently after, we switch to 5.1 and itegrate the fix in zstd creation. + // 5.0 is not a perfect way to detect faulty zim file (it will generate false + // positives) but it should be enough. 
+ if (header.getMajorVersion() == 5 && header.getMinorVersion() == 0) { + clusterCache.drop(idx.v); + } + } +#endif + return cluster; + } + + offset_t FileImpl::getClusterOffset(cluster_index_t idx) const + { + return readOffset(*clusterOffsetReader, idx.v); + } + + offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx) + { + auto cluster = getCluster(clusterIdx); + if (cluster->isCompressed()) + return offset_t(0); + return getClusterOffset(clusterIdx) + cluster->getBlobOffset(blobIdx); + } + + entry_index_t FileImpl::getNamespaceBeginOffset(char ch) const + { + log_trace("getNamespaceBeginOffset(" << ch << ')'); + return direntLookup().getNamespaceRangeBegin(ch); + } + + entry_index_t FileImpl::getNamespaceEndOffset(char ch) const + { + log_trace("getNamespaceEndOffset(" << ch << ')'); + return direntLookup().getNamespaceRangeEnd(ch); + } + + const std::string& FileImpl::getMimeType(uint16_t idx) const + { + if (idx >= mimeTypes.size()) + { + std::ostringstream msg; + msg << "unknown mime type code " << idx; + throw ZimFileFormatError(msg.str()); + } + + return mimeTypes[idx]; + } + + std::string FileImpl::getChecksum() + { + if (!header.hasChecksum()) + return std::string(); + + try { + auto chksum = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + + char hexdigest[33]; + hexdigest[32] = '\0'; + static const char hex[] = "0123456789abcdef"; + char* p = hexdigest; + for (int i = 0; i < 16; ++i) + { + uint8_t v = chksum.at(offset_t(i)); + *p++ = hex[v >> 4]; + *p++ = hex[v & 0xf]; + } + log_debug("chksum=" << hexdigest); + return hexdigest; + } catch (...) 
+ { + log_warn("error reading checksum"); + return std::string(); + } + } + + bool FileImpl::verify() + { + if (!header.hasChecksum()) + return false; + + struct zim_MD5_CTX md5ctx; + zim_MD5Init(&md5ctx); + + offset_type checksumPos = header.getChecksumPos(); + offset_type currentPos = 0; + for(auto part = zimFile->begin(); + part != zimFile->end(); + part++) { + std::ifstream stream(part->second->filename(), std::ios_base::in|std::ios_base::binary); + + char ch; + for(/*NOTHING*/ ; currentPos < checksumPos && stream.get(ch).good(); currentPos++) { + zim_MD5Update(&md5ctx, reinterpret_cast(&ch), 1); + } + if (stream.bad()) { + perror("error while reading file"); + return false; + } + if (currentPos == checksumPos) { + break; + } + } + + if (currentPos != checksumPos) { + return false; + } + + unsigned char chksumCalc[16]; + auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16)); + + zim_MD5Final(chksumCalc, &md5ctx); + if (std::memcmp(chksumFile.data(), chksumCalc, 16) != 0) + { + return false; + } + + return true; + } + + time_t FileImpl::getMTime() const { + return zimFile->getMTime(); + } + + zim::zsize_t FileImpl::getFilesize() const { + return zimReader->size(); + } + + bool FileImpl::is_multiPart() const { + return zimFile->is_multiPart(); + } + + bool FileImpl::checkIntegrity(IntegrityCheck checkType) { + switch(checkType) { + case IntegrityCheck::CHECKSUM: return FileImpl::checkChecksum(); + case IntegrityCheck::DIRENT_PTRS: return FileImpl::checkDirentPtrs(); + case IntegrityCheck::DIRENT_ORDER: return FileImpl::checkDirentOrder(); + case IntegrityCheck::TITLE_INDEX: return FileImpl::checkTitleIndex(); + case IntegrityCheck::CLUSTER_PTRS: return FileImpl::checkClusterPtrs(); + case IntegrityCheck::DIRENT_MIMETYPES: return FileImpl::checkDirentMimeTypes(); + case IntegrityCheck::COUNT: ASSERT("shouldn't have reached here", ==, ""); + } + return false; + } + + bool FileImpl::checkChecksum() { + if ( ! 
verify() ) { + std::cerr << "Checksum doesn't match" << std::endl; + return false; + } + return true; + } + + bool FileImpl::checkDirentPtrs() { + const entry_index_type articleCount = getCountArticles().v; + const offset_t validDirentRangeStart(80); // XXX: really??? + const offset_t validDirentRangeEnd = header.hasChecksum() + ? offset_t(header.getChecksumPos()) + : offset_t(zimReader->size().v); + const zsize_t direntMinSize(11); + for ( entry_index_type i = 0; i < articleCount; ++i ) + { + const auto offset = mp_urlDirentAccessor->getOffset(entry_index_t(i)); + if ( offset < validDirentRangeStart || + offset + direntMinSize > validDirentRangeEnd ) { + std::cerr << "Invalid dirent pointer" << std::endl; + return false; + } + } + return true; + } + + bool FileImpl::checkDirentOrder() { + const entry_index_type articleCount = getCountArticles().v; + std::shared_ptr prevDirent; + for ( entry_index_type i = 0; i < articleCount; ++i ) + { + const std::shared_ptr dirent = mp_urlDirentAccessor->getDirent(entry_index_t(i)); + if ( prevDirent && !(prevDirent->getLongUrl() < dirent->getLongUrl()) ) + { + std::cerr << "Dirent table is not properly sorted:\n" + << " #" << i-1 << ": " << prevDirent->getLongUrl() << "\n" + << " #" << i << ": " << dirent->getLongUrl() << std::endl; + return false; + } + prevDirent = dirent; + } + return true; + } + + bool FileImpl::checkClusterPtrs() { + const cluster_index_type clusterCount = getCountClusters().v; + const offset_t validClusterRangeStart(80); // XXX: really??? + const offset_t validClusterRangeEnd = header.hasChecksum() + ? 
offset_t(header.getChecksumPos()) + : offset_t(zimReader->size().v); + const zsize_t clusterMinSize(1); // XXX + for ( cluster_index_type i = 0; i < clusterCount; ++i ) + { + const auto offset = readOffset(*clusterOffsetReader, i); + if ( offset < validClusterRangeStart || + offset + clusterMinSize > validClusterRangeEnd ) { + std::cerr << "Invalid cluster pointer" << std::endl; + return false; + } + } + return true; + } + +namespace +{ + +std::string pseudoTitle(const Dirent& d) +{ + return std::string(1, d.getNamespace()) + '/' + d.getTitle(); +} + +bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type totalCount) { + const entry_index_type direntCount = accessor.getDirentCount().v; + std::shared_ptr prevDirent; + for ( entry_index_type i = 0; i < direntCount; ++i ) { + if (accessor.getDirectIndex(title_index_t(i)).v >= totalCount) { + std::cerr << "Invalid title index entry." << std::endl; + return false; + } + + const std::shared_ptr dirent = accessor.getDirent(title_index_t(i)); + if ( prevDirent && !(pseudoTitle(*prevDirent) <= pseudoTitle(*dirent)) ) { + std::cerr << "Title index is not properly sorted." 
<< std::endl; + return false; + } + prevDirent = dirent; + } + return true; +} + +} // unnamed namespace + + bool FileImpl::checkTitleIndex() { + const entry_index_type articleCount = getCountArticles().v; + + offset_t titleOffset(header.getTitleIdxPos()); + zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount()); + auto titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Full Title index table"); + auto ret = checkTitleListing(*titleDirentAccessor, articleCount); + + titleDirentAccessor = getTitleAccessor("listing/titleOrdered/v1"); + if (titleDirentAccessor) { + ret &= checkTitleListing(*titleDirentAccessor, articleCount); + } + return ret; + } + + bool FileImpl::checkDirentMimeTypes() { + const entry_index_type articleCount = getCountArticles().v; + for ( entry_index_type i = 0; i < articleCount; ++i ) + { + const auto dirent = mp_urlDirentAccessor->getDirent(entry_index_t(i)); + if ( dirent->isArticle() && dirent->getMimeType() >= mimeTypes.size() ) { + std::cerr << "Entry " << dirent->getLongUrl() + << " has invalid MIME-type value " << dirent->getMimeType() + << "." << std::endl; + return false; + } + } + return true; + } + +} diff --git a/src/fileimpl.h b/src/fileimpl.h new file mode 100644 index 0000000..4e1f7d7 --- /dev/null +++ b/src/fileimpl.h @@ -0,0 +1,182 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2020-2021 Veloman Yunkan + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. 
See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_FILEIMPL_H +#define ZIM_FILEIMPL_H + +#include +#include +#include +#include +#include +#include +#include "lrucache.h" +#include "concurrent_cache.h" +#include "_dirent.h" +#include "dirent_accessor.h" +#include "dirent_lookup.h" +#include "cluster.h" +#include "buffer.h" +#include "file_reader.h" +#include "file_compound.h" +#include "fileheader.h" +#include "zim_types.h" +#include "direntreader.h" + + +namespace zim +{ + class FileImpl + { + std::shared_ptr zimFile; + offset_t archiveStartOffset; + std::shared_ptr zimReader; + std::shared_ptr direntReader; + Fileheader header; + + std::unique_ptr clusterOffsetReader; + + std::shared_ptr mp_urlDirentAccessor; + std::unique_ptr mp_titleDirentAccessor; + + typedef std::shared_ptr ClusterHandle; + ConcurrentCache clusterCache; + + const bool m_newNamespaceScheme; + const bool m_hasFrontArticlesIndex; + const entry_index_t m_startUserEntry; + const entry_index_t m_endUserEntry; + + typedef std::vector MimeTypes; + MimeTypes mimeTypes; + + using pair_type = std::pair; + mutable std::vector m_articleListByCluster; + mutable std::mutex m_articleListByClusterMutex; + + struct DirentLookupConfig + { + typedef DirectDirentAccessor DirentAccessorType; + typedef entry_index_t index_t; + static const std::string& getDirentKey(const Dirent& d) { + return d.getUrl(); + } + }; + + using DirentLookup = zim::FastDirentLookup; + mutable std::unique_ptr m_direntLookup; + mutable std::mutex m_direntLookupCreationMutex; + + + struct ByTitleDirentLookupConfig + { + typedef IndirectDirentAccessor DirentAccessorType; + typedef title_index_t index_t; + static const std::string& getDirentKey(const Dirent& d) { + return d.getTitle(); + } + }; + + using 
ByTitleDirentLookup = zim::DirentLookup; + std::unique_ptr m_byTitleDirentLookup; + + public: + using FindxResult = std::pair; + using FindxTitleResult = std::pair; + + explicit FileImpl(const std::string& fname); +#ifndef _WIN32 + explicit FileImpl(int fd); + FileImpl(int fd, offset_t offset, zsize_t size); +#endif + + offset_t getArchiveStartOffset() const { return archiveStartOffset; } + time_t getMTime() const; + + const std::string& getFilename() const { return zimFile->filename(); } + const Fileheader& getFileheader() const { return header; } + zsize_t getFilesize() const; + bool hasNewNamespaceScheme() const { return m_newNamespaceScheme; } + bool hasFrontArticlesIndex() const { return m_hasFrontArticlesIndex; } + + FileCompound::PartRange getFileParts(offset_t offset, zsize_t size); + std::shared_ptr getDirent(entry_index_t idx); + std::shared_ptr getDirentByTitle(title_index_t idx); + entry_index_t getIndexByTitle(title_index_t idx) const; + entry_index_t getIndexByClusterOrder(entry_index_t idx) const; + entry_index_t getCountArticles() const { return entry_index_t(header.getArticleCount()); } + + FindxResult findx(char ns, const std::string& url); + FindxResult findx(const std::string& url); + FindxTitleResult findxByTitle(char ns, const std::string& title); + + std::shared_ptr getCluster(cluster_index_t idx); + cluster_index_t getCountClusters() const { return cluster_index_t(header.getClusterCount()); } + offset_t getClusterOffset(cluster_index_t idx) const; + offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx); + + entry_index_t getNamespaceBeginOffset(char ch) const; + entry_index_t getNamespaceEndOffset(char ch) const; + entry_index_t getNamespaceEntryCount(char ch) const { + return getNamespaceEndOffset(ch) - getNamespaceBeginOffset(ch); + } + + entry_index_t getStartUserEntry() const { return m_startUserEntry; } + entry_index_t getEndUserEntry() const { return m_endUserEntry; } + // The number of entries added by the creator. 
(So excluding index, ...). + // On new namespace scheme, number of entries in C namespace + entry_index_t getUserEntryCount() const { return m_endUserEntry - m_startUserEntry; } + // The number of enties that can be considered as front article (no resource) + entry_index_t getFrontEntryCount() const; + + const std::string& getMimeType(uint16_t idx) const; + + std::string getChecksum(); + bool verify(); + bool is_multiPart() const; + + bool checkIntegrity(IntegrityCheck checkType); + private: + explicit FileImpl(std::shared_ptr zimFile); + FileImpl(std::shared_ptr zimFile, offset_t offset, zsize_t size); + + std::unique_ptr getTitleAccessor(const std::string& path); + std::unique_ptr getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name); + + void prepareArticleListByCluster() const; + DirentLookup& direntLookup() const; + ClusterHandle readCluster(cluster_index_t idx); + offset_type getMimeListEndUpperLimit() const; + void readMimeTypes(); + void quickCheckForCorruptFile(); + + bool checkChecksum(); + bool checkDirentPtrs(); + bool checkDirentOrder(); + bool checkTitleIndex(); + bool checkClusterPtrs(); + bool checkDirentMimeTypes(); + }; + +} + +#endif // ZIM_FILEIMPL_H + diff --git a/src/fs.h b/src/fs.h new file mode 100644 index 0000000..5736a5e --- /dev/null +++ b/src/fs.h @@ -0,0 +1,38 @@ +/* + * Copyright (C) 2018 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_H_
+#define ZIM_FS_H_
+
+#ifdef _WIN32
+# include "fs_windows.h"
+#else
+# include "fs_unix.h"
+#endif
+
+namespace zim {
+
+#ifdef _WIN32
+using DEFAULTFS = windows::FS;
+#else
+using DEFAULTFS = unix::FS;
+#endif
+};
+
+#endif //ZIM_FS_H_
diff --git a/src/fs_unix.cpp b/src/fs_unix.cpp
new file mode 100644
index 0000000..e5f404f
--- /dev/null
+++ b/src/fs_unix.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fs_unix.h"
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace zim
+{
+
+namespace unix {
+
+// Reads `size` bytes starting at `offset` in the file into `dest`, calling
+// pread() repeatedly until short reads add up to the requested amount.
+// Returns the number of bytes read, or zsize_t(-1) if a read fails or the
+// file ends before `size` bytes are available.
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+#if defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__)
+# define PREAD pread
+#else
+# define PREAD pread64
+#endif
+  ssize_t full_size_read = 0;
+  auto size_to_read = size.v;
+  auto current_offset = offset.v;
+  errno = 0;
+  while (size_to_read > 0) {
+    // Continue after the bytes already stored so that a short read does not
+    // overwrite the beginning of dest.
+    auto size_read = PREAD(m_fd, dest + full_size_read, size_to_read, current_offset);
+    if (size_read == -1) {
+      return zsize_t(-1);
+    }
+    if (size_read == 0) {
+      // End of file: pread() would keep returning 0 and this loop would
+      // never terminate, so report the truncated read as an error.
+      return zsize_t(-1);
+    }
+    size_to_read -= size_read;
+    current_offset += size_read;
+    full_size_read += size_read;
+  }
+  return zsize_t(full_size_read);
+#undef PREAD
+}
+
+// Returns the size of the file in bytes, or 0 if fstat() fails.
+zsize_t FD::getSize() const
+{
+  struct stat sb;
+  if (fstat(m_fd, &sb) != 0) {
+    // sb holds no meaningful data when fstat() fails.
+    return zsize_t(0);
+  }
+  return zsize_t(sb.st_size);
+}
+
+// Sets the file position to `offset` bytes from the start of the file.
+// Returns true if lseek() reports that position.
+bool FD::seek(offset_t offset)
+{
+  return static_cast(offset.v) == lseek(m_fd, offset.v, SEEK_SET);
+}
+
+// Closes the descriptor if one is held. Returns true only if a descriptor
+// was held and ::close() succeeded.
+bool FD::close() {
+  if (m_fd == -1) {
+    return false;
+  }
+  // ::close() returns 0 on success. Forget the descriptor in either case so
+  // that a later close() (e.g. from the destructor) cannot close it twice.
+  const bool closed = (::close(m_fd) == 0);
+  m_fd = -1;
+  return closed;
+}
+
+// Opens `filepath` read-only.
+// Throws std::runtime_error carrying the strerror() text if open() fails.
+FD FS::openFile(path_t filepath)
+{
+  int fd = open(filepath.c_str(), O_RDONLY);
+  if (fd == -1) {
+    const std::string errorStr = strerror(errno);
+    throw std::runtime_error("Error opening file: " + filepath + ": " + errorStr);
+  }
+  return FD(fd);
+}
+
+// Creates the directory `path` (mode 0775 before the umask is applied).
+// Returns true on success.
+bool FS::makeDirectory(path_t path)
+{
+  return !mkdir(path.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH);
+}
+
+// Renames `old_path` to `new_path`.
+// Throws std::runtime_error if ::rename() fails, rather than dropping the error.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  if (::rename(old_path.c_str(), new_path.c_str()) != 0) {
+    const std::string errorStr = strerror(errno);
+    throw std::runtime_error("Cannot move file " + old_path + " to " + new_path + ": " + errorStr);
+  }
+}
+
+// Joins two path components with '/'.
+std::string FS::join(path_t base, path_t name)
+{
+  return base + "/" + name;
+}
+
+// Removes `path`. If it can be opened as a directory, every entry except
+// "." and ".." is removed recursively before the directory itself;
+// otherwise `path` is removed as a file.
+// Returns the result of the final removeDir()/removeFile() call.
+bool FS::remove(path_t path)
+{
+  DIR* dir;
+  /* It's a directory, remove all its entries first */
+  if ((dir = opendir(path.c_str())) != NULL) {
+    struct dirent* ent;
+    while ((ent = readdir(dir)) != NULL) {
+      std::string childName = ent->d_name;
+      if (childName != "." && childName != "..") {
+        auto childPath = join(path, childName);
+        remove(childPath);
+      }
+    }
+    closedir(dir);
+    return removeDir(path);
+  }
+
+  /* It's a file */
+  else {
+    return removeFile(path);
+  }
+}
+
+// Removes the directory `path`. Returns true on success.
+bool FS::removeDir(path_t path) {
+  // rmdir() returns 0 on success, so compare explicitly instead of
+  // converting the int to bool (which reported success as false).
+  return rmdir(path.c_str()) == 0;
+}
+
+// Removes the file `path`. Returns true on success.
+bool FS::removeFile(path_t path) {
+  return ::remove(path.c_str()) == 0;
+}
+
+
+}; // unix namespace
+
+// Returns the "/dev/fd/<fd>" path for the descriptor `fd`.
+std::string getFilePathFromFD(int fd)
+{
+  std::ostringstream oss;
+  oss << "/dev/fd/" << fd;
+
+  return oss.str();
+}
+
+}; // zim namespace
+
diff --git a/src/fs_unix.h b/src/fs_unix.h
new file mode 100644
index 0000000..51aab05
--- /dev/null
+++ b/src/fs_unix.h
@@ -0,0 +1,92 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_UNIX_H_
+#define ZIM_FS_UNIX_H_
+
+#include "zim_types.h"
+
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+namespace zim {
+
+namespace unix {
+
+using path_t = const std::string&;
+
+class FD {
+  public:
+    using fd_t = int;
+
+  private:
+    fd_t m_fd = -1;
+
+  public:
+    FD() = default;
+    FD(fd_t fd):
+      m_fd(fd) {};
+    FD(const FD& o) = delete;
+    FD(FD&& o) :
+      m_fd(o.m_fd) { o.m_fd = -1; }
+    FD& operator=(FD&& o) {
+      // Close the descriptor already owned (it used to leak) before taking o's.
+      if (this != &o) { close(); m_fd = o.m_fd; o.m_fd = -1; }
+      return *this;
+    }
+    ~FD() { close(); }
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    zsize_t getSize() const;
+    fd_t getNativeHandle() const
+    {
+      return m_fd;
+    }
+    fd_t release()
+    {
+      int ret = m_fd;
+      m_fd = -1;
+      return ret;
+    }
+    bool seek(offset_t offset);
+    bool close();
+};
+
+struct FS {
+    using FD = zim::unix::FD;
+    static std::string join(path_t base, path_t name);
+    static FD openFile(path_t filepath);
+    static bool makeDirectory(path_t path);
+    static void rename(path_t old_path, path_t new_path);
+    static bool remove(path_t path);
+    static bool removeDir(path_t path);
+    static bool removeFile(path_t path);
+};
+
+}; // unix namespace
+
+std::string getFilePathFromFD(int fd);
+
+}; // zim namespace
+
+#endif //ZIM_FS_UNIX_H_
diff --git a/src/fs_windows.cpp b/src/fs_windows.cpp
new file mode 100644
index 0000000..4fe5684
--- /dev/null
+++ b/src/fs_windows.cpp
@@ -0,0 +1,245 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fs_windows.h"
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+namespace zim {
+
+namespace windows {
+
+// State behind an FD: the Win32 file handle and the critical section that
+// FD::readAt() holds while it positions the file pointer and reads.
+struct ImplFD {
+  HANDLE m_handle = INVALID_HANDLE_VALUE;
+  CRITICAL_SECTION m_criticalSection;
+
+  ImplFD() {
+    InitializeCriticalSection(&m_criticalSection);
+  }
+  ImplFD(HANDLE handle) :
+    m_handle(handle)
+  {
+    InitializeCriticalSection(&m_criticalSection);
+  }
+
+  ~ImplFD() {
+    DeleteCriticalSection(&m_criticalSection);
+  }
+};
+
+FD::FD() :
+  mp_impl(new ImplFD()) {}
+
+FD::FD(fd_t handle) :
+  mp_impl(new ImplFD(handle)) {}
+
+FD::FD(FD&& o) = default;
+
+// Move assignment. ~ImplFD() does not call CloseHandle(), so the handle
+// currently owned must be closed here before the state is replaced.
+FD& FD::operator=(FD&& o)
+{
+  if (this != &o) {
+    if (mp_impl)
+      close();
+    mp_impl.reset(o.mp_impl.release());
+  }
+  return *this;
+}
+
+// Closes the handle unless this FD has been moved from.
+FD::~FD()
+{
+  if (mp_impl)
+    close();
+}
+
+// Reads exactly `size` bytes located at `offset` into `dest`.
+// The file pointer move and the read are done while holding the critical
+// section. Returns `size` on success and zsize_t(-1) if the FD has been
+// moved from, a Win32 call fails, or fewer than `size` bytes were read.
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+  if (!mp_impl)
+    return zsize_t(-1);
+  EnterCriticalSection(&mp_impl->m_criticalSection);
+  LARGE_INTEGER off;
+  off.QuadPart = offset.v;
+  if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) {
+    goto err;
+  }
+
+  DWORD size_read;
+  if (!ReadFile(mp_impl->m_handle, dest, size.v, &size_read, NULL)) {
+    goto err;
+  }
+  if (size_read != size.v) {
+    goto err;
+  }
+  LeaveCriticalSection(&mp_impl->m_criticalSection);
+  return size;
+err:
+  LeaveCriticalSection(&mp_impl->m_criticalSection);
+  return zsize_t(-1);
+}
+
+// Moves the file pointer to `offset` bytes from the start of the file.
+// Returns false if the FD has been moved from or SetFilePointerEx() fails.
+bool FD::seek(offset_t offset)
+{
+  if(!mp_impl)
+    return false;
+  LARGE_INTEGER off;
+  off.QuadPart = offset.v;
+  return SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN);
+}
+
+// Returns the file size in bytes, or 0 if the FD has been moved from or
+// GetFileSizeEx() fails.
+zsize_t FD::getSize() const
+{
+  if(!mp_impl)
+    return zsize_t(0);
+  LARGE_INTEGER size;
+  if (!GetFileSizeEx(mp_impl->m_handle, &size)) {
+    size.QuadPart = 0;
+  }
+  return zsize_t(size.QuadPart);
+}
+
+// Passes the handle to _open_osfhandle() and returns the resulting C runtime
+// file descriptor (-1 if the FD has been moved from). The FD gives up the
+// handle, so close() and the destructor will no longer close it.
+int FD::release()
+{
+  if(!mp_impl)
+    return -1;
+  int ret = _open_osfhandle(reinterpret_cast(mp_impl->m_handle), 0);
+  mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  return ret;
+}
+
+// Closes the handle if one is held. Returns true only if a handle was held
+// and CloseHandle() succeeded.
+bool FD::close()
+{
+  if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) {
+    return false;
+  }
+  // Forget the handle before closing it: otherwise a later close() (e.g.
+  // from ~FD() after an explicit close()) would call CloseHandle() again
+  // on a stale handle value.
+  const HANDLE handle = mp_impl->m_handle;
+  mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  return CloseHandle(handle);
+}
+
+// Converts the UTF-8 string `path` to a wide-character string.
+// Throws std::runtime_error (with the GetLastError() code) if the
+// conversion fails.
+std::unique_ptr FS::toWideChar(path_t path)
+{
+  auto size = MultiByteToWideChar(CP_UTF8, 0,
+                path.c_str(), -1, nullptr, 0);
+  auto wdata = std::unique_ptr(new wchar_t[size]);
+  auto ret = MultiByteToWideChar(CP_UTF8, 0,
+                path.c_str(), -1, wdata.get(), size);
+  if (0 == ret) {
+    std::ostringstream oss;
+    oss << "Cannot convert path to wchar : " << GetLastError();
+    throw std::runtime_error(oss.str());
+  }
+  return wdata;
+}
+
+// Opens `filepath` for reading (other readers may share it).
+// Throws std::runtime_error (with the GetLastError() code) if the file
+// cannot be opened.
+FD FS::openFile(path_t filepath)
+{
+  auto wpath = toWideChar(filepath);
+  FD::fd_t handle;
+  handle = CreateFileW(wpath.get(),
+             GENERIC_READ,
+             FILE_SHARE_READ,
+             NULL,
+             OPEN_EXISTING,
+             FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS,
+             NULL);
+  if (handle == INVALID_HANDLE_VALUE) {
+    std::ostringstream oss;
+    oss << "Cannot open file : " << GetLastError();
+    throw std::runtime_error(oss.str());
+  }
+  return FD(handle);
+}
+
+// Creates the directory `path`. Returns true on success.
+bool FS::makeDirectory(path_t path)
+{
+  auto wpath = toWideChar(path);
+  auto ret = CreateDirectoryW(wpath.get(), NULL);
+  return ret;
+}
+
+
+// Moves `old_path` to `new_path`, replacing an existing destination.
+// Throws std::runtime_error if the move fails.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  auto ret = MoveFileExW(toWideChar(old_path).get(), toWideChar(new_path).get(), MOVEFILE_REPLACE_EXISTING|MOVEFILE_WRITE_THROUGH);
+  if (!ret) {
+    std::ostringstream oss;
+    oss << "Cannot move file " << old_path << " to " << new_path;
+    throw std::runtime_error(oss.str());
+  }
+}
+
+// Joins two path components with a backslash.
+std::string FS::join(path_t base, path_t name)
+{
+  return base + "\\" + name;
+}
+
+// Removes the directory `path`. Returns true on success.
+bool FS::removeDir(path_t path)
+{
+  return RemoveDirectoryW(toWideChar(path).get());
+}
+
+// Removes the file `path`. Returns true on success.
+bool FS::removeFile(path_t path)
+{
+  return DeleteFileW(toWideChar(path).get());
+}
+
+}; // windows namespace
+
+}; // zim namespace
+
diff --git a/src/fs_windows.h b/src/fs_windows.h
new file mode 100644
index 0000000..9e4ae07
--- /dev/null
+++ b/src/fs_windows.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_WINDOWS_H_
+#define ZIM_FS_WINDOWS_H_
+
+#include "zim_types.h"
+
+#include
+#include
+
+typedef void* HANDLE;
+
+namespace zim {
+
+namespace windows {
+
+using path_t = const std::string&;
+
+struct ImplFD;
+
+class FD {
+  public:
+    typedef HANDLE fd_t;
+  private:
+    std::unique_ptr mp_impl;
+
+  public:
+    FD();
+    FD(fd_t handle);
+    FD(const FD& o) = delete;
+    FD(FD&& o);
+    FD& operator=(FD&& o);
+    FD& operator=(const FD& o) = delete;
+    ~FD();
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    zsize_t getSize() const;
+    int release();
+    bool seek(offset_t offset);
+    bool close();
+};
+
+struct FS {
+    using FD = zim::windows::FD;
+    static std::string join(path_t base, path_t name);
+    static std::unique_ptr toWideChar(path_t path);
+    static FD openFile(path_t
filepath); + static bool makeDirectory(path_t path); + static void rename(path_t old_path, path_t new_path); + static bool remove(path_t path); + static bool removeDir(path_t path); + static bool removeFile(path_t path); +}; + +}; // windows namespace + +}; // zim namespace + +#endif //ZIM_FS_WINDOWS_H_ diff --git a/src/istreamreader.cpp b/src/istreamreader.cpp new file mode 100644 index 0000000..9ac4830 --- /dev/null +++ b/src/istreamreader.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "istreamreader.h" +#include "buffer_reader.h" + +namespace zim +{ + +//////////////////////////////////////////////////////////////////////////////// +// IDataStream +//////////////////////////////////////////////////////////////////////////////// + +std::unique_ptr +IStreamReader::sub_reader(zsize_t size) +{ + auto buffer = Buffer::makeBuffer(size); + readImpl(const_cast(buffer.data()), size); + return std::unique_ptr(new BufferReader(buffer)); +} + +} // namespace zim diff --git a/src/istreamreader.h b/src/istreamreader.h new file mode 100644 index 0000000..4255d3f --- /dev/null +++ b/src/istreamreader.h @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_IDATASTREAM_H +#define ZIM_IDATASTREAM_H + +#include +#include + +#include "endian_tools.h" +#include "reader.h" + +namespace zim +{ + +// IStreamReader is a simple interface for sequential iteration over a stream +// of values of built-in/primitive types and/or opaque binary objects (blobs). +// An example usage: +// +// void foo(IStreamReader& s) +// { +// const uint32_t n = s.read<uint32_t>(); +// for(uint32_t i=0; i < n; ++i) +// { +// const uint16_t blobSize = s.read<uint16_t>(); +// auto blob = s.sub_reader(zsize_t(blobSize)); +// bar(blob, blobSize); +// } +// } +// +class IStreamReader +{ +public: // functions + virtual ~IStreamReader() = default; + + // Reads a value of the said type from the stream + // + // For best portability this function should be used with types of known + // bit-width (int32_t, uint16_t, etc) rather than builtin types with + // unknown bit-width (int, unsigned, etc). + template T read(); + + // Reads a blob of the specified size from the stream + virtual std::unique_ptr sub_reader(zsize_t size); + +private: // virtual methods + // Reads exactly 'nbytes' bytes into the provided buffer 'buf' + // (which must be at least that big). Throws an exception if + // more bytes are requested than can be retrieved. + virtual void readImpl(char* buf, zsize_t nbytes) = 0; +}; + +//////////////////////////////////////////////////////////////////////////////// +// Implementation of IStreamReader +//////////////////////////////////////////////////////////////////////////////// + +// XXX: Assuming that opaque binary data retrieved via 'readImpl()' +// XXX: is encoded in little-endian form. 
+template +inline T +IStreamReader::read() +{ + constexpr size_type N(sizeof(T)); + char buf[N]; + readImpl(buf, zsize_t(N)); + return fromLittleEndian(buf); // XXX: This handles only integral types +} + +} // namespace zim + +#endif // ZIM_IDATASTREAM_H diff --git a/src/item.cpp b/src/item.cpp new file mode 100644 index 0000000..19a95e9 --- /dev/null +++ b/src/item.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (C) 2021 Veloman Yunkan + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#define ZIM_PRIVATE +#include +#include "_dirent.h" +#include "cluster.h" +#include "fileimpl.h" +#include "file_part.h" +#include "log.h" + +log_define("zim.item") + +using namespace zim; + +Item::Item(std::shared_ptr file, entry_index_type idx) + : m_file(file), + m_idx(idx), + m_dirent(file->getDirent(entry_index_t(idx))) +{} + +std::string Item::getTitle() const +{ + return m_dirent->getTitle(); +} + +std::string Item::getPath() const +{ + if (m_file->hasNewNamespaceScheme()) { + return m_dirent->getUrl(); + } else { + return m_dirent->getLongUrl(); + } +} + +std::string Item::getMimetype() const +{ + return m_file->getMimeType(m_dirent->getMimeType()); +} + +Blob Item::getData(offset_type offset) const +{ + auto size = getSize()-offset; + return getData(offset, size); +} + +Blob Item::getData(offset_type offset, size_type size) const +{ + auto cluster = m_file->getCluster(m_dirent->getClusterNumber()); + return cluster->getBlob(m_dirent->getBlobNumber(), + offset_t(offset), + zsize_t(size)); +} + +size_type Item::getSize() const +{ + auto cluster = m_file->getCluster(m_dirent->getClusterNumber()); + return size_type(cluster->getBlobSize(m_dirent->getBlobNumber())); +} + +std::pair Item::getDirectAccessInformation() const +{ + auto cluster = m_file->getCluster(m_dirent->getClusterNumber()); + if (cluster->isCompressed()) { + return std::make_pair("", 0); + } + + auto full_offset = m_file->getBlobOffset(m_dirent->getClusterNumber(), + m_dirent->getBlobNumber()); + + full_offset += m_file->getArchiveStartOffset().v; + + auto part_its = m_file->getFileParts(full_offset, zsize_t(getSize())); + auto first_part = part_its.first; + if (++part_its.first != part_its.second) { + // The content is split on two parts. 
We cannot have direct access + return std::make_pair("", 0); + } + auto range = first_part->first; + auto part = first_part->second; + const offset_type local_offset(full_offset - range.min); + return std::make_pair(part->filename(), local_offset); +} + +cluster_index_type Item::getClusterIndex() const +{ + return m_dirent->getClusterNumber().v; +} diff --git a/src/log.h b/src/log.h new file mode 100644 index 0000000..5fbd81a --- /dev/null +++ b/src/log.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "config.h" + +#ifdef WITH_CXXTOOLS + +#include + +#else + +#define log_define(e) +#define log_fatal(e) +#define log_error(e) +#define log_warn(e) +#define log_info(e) +#define log_debug(e) +#define log_trace(e) +#define log_init() + +#endif diff --git a/src/lrucache.h b/src/lrucache.h new file mode 100644 index 0000000..3389446 --- /dev/null +++ b/src/lrucache.h @@ -0,0 +1,160 @@ +/* + * Copyrigth (c) 2021, Matthieu Gautier + * Copyright (c) 2020, Veloman Yunkan + * Copyright (c) 2014, lamerman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * * Neither the name of lamerman nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * File: lrucache.hpp + * Author: Alexander Ponomarev + * + * Created on June 20, 2013, 5:09 PM + */ + +#ifndef _LRUCACHE_HPP_INCLUDED_ +#define _LRUCACHE_HPP_INCLUDED_ + +#include +#include +#include +#include +#include + +namespace zim { + +template +class lru_cache { +public: // types + typedef typename std::pair key_value_pair_t; + typedef typename std::list::iterator list_iterator_t; + + enum AccessStatus { + HIT, // key was found in the cache + PUT, // key was not in the cache but was created by the getOrPut() access + MISS // key was not in the cache; get() access failed + }; + + class AccessResult + { + const AccessStatus status_; + const value_t val_; + public: + AccessResult(const value_t& val, AccessStatus status) + : status_(status), val_(val) + {} + AccessResult() : status_(MISS), val_() {} + + bool hit() const { return status_ == HIT; } + bool miss() const { return !hit(); } + const value_t& value() const + { + if ( status_ == MISS ) + throw std::range_error("There is no such key in cache"); + return val_; + } + + operator const value_t& () const { return value(); } + }; + +public: // functions + explicit lru_cache(size_t max_size) : + _max_size(max_size) { + } + + // If 'key' is present in the cache, returns the associated value, + // otherwise puts the given value into the cache (and returns it with + // a status of a cache miss). 
+ AccessResult getOrPut(const key_t& key, const value_t& value) { + auto it = _cache_items_map.find(key); + if (it != _cache_items_map.end()) { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + return AccessResult(it->second->second, HIT); + } else { + putMissing(key, value); + return AccessResult(value, PUT); + } + } + + void put(const key_t& key, const value_t& value) { + auto it = _cache_items_map.find(key); + if (it != _cache_items_map.end()) { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + it->second->second = value; + } else { + putMissing(key, value); + } + } + + AccessResult get(const key_t& key) { + auto it = _cache_items_map.find(key); + if (it == _cache_items_map.end()) { + return AccessResult(); + } else { + _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second); + return AccessResult(it->second->second, HIT); + } + } + + bool drop(const key_t& key) { + try { + auto list_it = _cache_items_map.at(key); + _cache_items_list.erase(list_it); + _cache_items_map.erase(key); + return true; + } catch (std::out_of_range& e) { + return false; + } + } + + bool exists(const key_t& key) const { + return _cache_items_map.find(key) != _cache_items_map.end(); + } + + size_t size() const { + return _cache_items_map.size(); + } + +private: // functions + void putMissing(const key_t& key, const value_t& value) { + assert(_cache_items_map.find(key) == _cache_items_map.end()); + _cache_items_list.push_front(key_value_pair_t(key, value)); + _cache_items_map[key] = _cache_items_list.begin(); + if (_cache_items_map.size() > _max_size) { + _cache_items_map.erase(_cache_items_list.back().first); + _cache_items_list.pop_back(); + } + } + +private: // data + std::list _cache_items_list; + std::map _cache_items_map; + size_t _max_size; +}; + +} // namespace zim + +#endif /* _LRUCACHE_HPP_INCLUDED_ */ diff --git a/src/md5.c b/src/md5.c new file mode 100644 index 0000000..bae002e 
--- /dev/null +++ b/src/md5.c @@ -0,0 +1,340 @@ +/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +#include "md5.h" +#include + +#define MD5_CTX struct zim_MD5_CTX + +/* Constants for MD5Transform routine. 
+ */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + +static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64])); +static void Encode PROTO_LIST + ((unsigned char *, UINT4 *, unsigned int)); +static void Decode PROTO_LIST + ((UINT4 *, const unsigned char *, unsigned int)); +/* +static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int)); +static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int)); +*/ +#define MD5_memcpy memcpy +#define MD5_memset memset + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. + */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. + */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +Rotation is separate from addition to prevent recomputation. 
+ */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. + */ +void zim_MD5Init (MD5_CTX* context) +{ + context->count[0] = context->count[1] = 0; + /* Load magic initialization constants. +*/ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* MD5 block update operation. Continues an MD5 message-digest + operation, processing another message block, and updating the + context. + */ +void zim_MD5Update ( +MD5_CTX *context, +const unsigned char *input, /* input block */ +unsigned int inputLen) /* length of input block */ +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((UINT4)inputLen << 3)) + < ((UINT4)inputLen << 3)) + context->count[1]++; + context->count[1] += ((UINT4)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. 
+*/ + if (inputLen >= partLen) { + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)input, partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + MD5_memcpy + ((POINTER)&context->buffer[index], (POINTER)&input[i], + inputLen-i); +} + +/* MD5 finalization. Ends an MD5 message-digest operation, writing the + the message digest and zeroizing the context. + */ +void zim_MD5Final ( +unsigned char digest[16], /* message digest */ +MD5_CTX *context) /* context */ +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. +*/ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + zim_MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + zim_MD5Update (context, bits, 8); + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
+ */ +static void MD5Transform ( +UINT4 state[4], +const unsigned char block[64]) +{ + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH 
(c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. +*/ + MD5_memset ((POINTER)x, 0, sizeof (x)); +} + +/* Encodes input (UINT4) into output (unsigned char). Assumes len is + a multiple of 4. 
+ */ +static void Encode ( +unsigned char *output, +UINT4 *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* Decodes input (unsigned char) into output (UINT4). Assumes len is + a multiple of 4. + */ +static void Decode ( +UINT4 *output, +const unsigned char *input, +unsigned int len) +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) | + (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24); +} + +#if 0 +/* Note: Replace "for loop" with standard memcpy if possible. + */ + +static void MD5_memcpy ( +POINTER output, +POINTER input, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + output[i] = input[i]; +} + +/* Note: Replace "for loop" with standard memset if possible. + */ +static void MD5_memset ( +POINTER output, +int value, +unsigned int len) +{ + unsigned int i; + + for (i = 0; i < len; i++) + ((char *)output)[i] = (char)value; +} +#endif diff --git a/src/md5.h b/src/md5.h new file mode 100644 index 0000000..29bdc39 --- /dev/null +++ b/src/md5.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2003 Tommi Maekitalo + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * As a special exception, you may use this file as part of a free + * software library without restriction. 
Specifically, if other files + * instantiate templates or use macros or inline functions from this + * file, or you compile this file and link it with other files to + * produce an executable, this file does not by itself cause the + * resulting executable to be covered by the GNU General Public + * License. This exception does not however invalidate any other + * reasons why the executable file might be covered by the GNU Library + * General Public License. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +rights reserved. + +License to copy and use this software is granted provided that it +is identified as the "RSA Data Security, Inc. MD5 Message-Digest +Algorithm" in all material mentioning or referencing this software +or this function. + +License is also granted to make and use derivative works provided +that such works are identified as "derived from the RSA Data +Security, Inc. MD5 Message-Digest Algorithm" in all material +mentioning or referencing the derived work. + +RSA Data Security, Inc. makes no representations concerning either +the merchantability of this software or the suitability of this +software for any particular purpose. It is provided "as is" +without express or implied warranty of any kind. + +These notices must be retained in any copies of any part of this +documentation and/or software. + */ + +/* RSAREF types and constants + */ + +/* PROTOTYPES should be set to one if and only if the compiler supports + function argument prototyping. 
+The following makes PROTOTYPES default to 1 if it has not already + been defined with C compiler flags. + */ + +#ifndef ZIM_MD5_H +#define ZIM_MD5_H + +#ifndef PROTOTYPES +#define PROTOTYPES 1 +#endif + +/* POINTER defines a generic pointer type */ +typedef unsigned char *POINTER; + +/* UINT2 defines a two byte word */ +typedef unsigned short int UINT2; + +/* UINT4 defines a four byte word */ +typedef unsigned int UINT4; + +/* PROTO_LIST is defined depending on how PROTOTYPES is defined above. + If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it + returns an empty list. + */ + +#if PROTOTYPES +#define PROTO_LIST(list) list +#else +#define PROTO_LIST(list) () +#endif + +/* MD5 context. */ +struct zim_MD5_CTX { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ +}; + +#ifdef __cplusplus +extern "C" { +#endif + +void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *)); +void zim_MD5Update PROTO_LIST + ((struct zim_MD5_CTX *, const unsigned char *, unsigned int)); +void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *)); + +#ifdef __cplusplus +} +#endif + +#endif /* ZIM_MD5_H */ diff --git a/src/meson.build b/src/meson.build new file mode 100644 index 0000000..4529b7c --- /dev/null +++ b/src/meson.build @@ -0,0 +1,82 @@ + +configure_file(output : 'config.h', + configuration : private_conf, + input : 'config.h.in') + +src_directory = include_directories('.') + +common_sources = [ +# 'config.h', + 'archive.cpp', + 'cluster.cpp', + 'buffer_reader.cpp', + 'dirent.cpp', + 'dirent_accessor.cpp', + 'entry.cpp', + 'envvalue.cpp', + 'fileheader.cpp', + 'fileimpl.cpp', + 'file_compound.cpp', + 'file_reader.cpp', + 'item.cpp', + 'blob.cpp', + 'buffer.cpp', + 'md5.c', + 'template.cpp', + 'uuid.cpp', + 'tools.cpp', + 'compression.cpp', + 'istreamreader.cpp', + 'writer/contentProvider.cpp', + 'writer/creator.cpp', + 'writer/item.cpp', 
'writer/cluster.cpp', + 'writer/dirent.cpp', + 'writer/workers.cpp', + 'writer/clusterWorker.cpp', + 'writer/titleListingHandler.cpp', + 'writer/counterHandler.cpp', + 'suggestion.cpp', + 'suggestion_iterator.cpp', + 'version.cpp' +] + +if host_machine.system() == 'windows' + common_sources += 'fs_windows.cpp' +else + common_sources += 'fs_unix.cpp' +endif + +xapian_sources = [ + 'search.cpp', + 'search_iterator.cpp', + 'xapian/htmlparse.cc', + 'xapian/myhtmlparse.cc', + 'writer/xapianIndexer.cpp', + 'writer/xapianWorker.cpp', + 'writer/xapianHandler.cpp' +] + +sources = common_sources +deps = [thread_dep, lzma_dep, zstd_dep] + +if target_machine.system() == 'freebsd' + deps += [execinfo_dep] +endif + +if xapian_dep.found() + sources += xapian_sources + sources += lib_resources + deps += [xapian_dep, icu_dep] +endif + +libzim = library('zim', + sources, + include_directories : inc, + dependencies : deps, + link_args : extra_link_args, + cpp_args : extra_cpp_args, + version: meson.project_version(), + install : true) +libzim_dep = declare_dependency(link_with: libzim, + include_directories: include_directory) diff --git a/src/narrowdown.h b/src/narrowdown.h new file mode 100644 index 0000000..361a078 --- /dev/null +++ b/src/narrowdown.h @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_NARROWDOWN_H +#define ZIM_NARROWDOWN_H + +#include "zim_types.h" +#include "debug.h" + +#include +#include + +#include + +namespace zim +{ + +// Given a sorted sequence of items with a string key, NarrowDown helps to +// narrow down the range in which the query key should belong. +// +// The target usage of this class is as a partial in-memory index for a sorted +// list residing in external storage with high access cost to individual items. +// +// Illustration: +// +// In RAM: +// key: A I Q Y g o w z +// item #: | | | | | | | | +// ----------- | | | | | | | | +// On disk: V V V V V V V V +// key: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz +// data: ajo097124ljp-oasd)(&(*)llkjasdf@$^nFDSs00ujlasdfjkll +// +// In such an external list looking up an item by key can be performed via a +// binary search where on each iteration the item key must be accessed. There +// are two performance problems with that: +// 1. The API may not allow accessing only the key of the given item, reading +// the entire item instead (this is the case with dirents). +// 2. Access to items (or only their keys) in external storage is expensive. +// +// NarrowDown speeds up the look-up operation in such an external list by +// allowing it to be split into two steps: +// 1. Perform the binary search on the index, yielding a narrower range +// 2. Perform the binary search on the external list starting from that +// narrower range. +// +// The denser the in-memory index the more the performance improvement. +// Therefore the implementation focus of NarrowDown is on small memory +// footprint. If the item keys are long strings with a lot of "garbage" at the +// end the following trick helps. 
Suppose that we have the following pair of +// adjacent keys in our full (external) list: +// +// Item # | Key +// --------------------------------- +// ... | ... +// 1234 | "We Are The Champions" +// 1235 | "We Will Rock You" +// ... | ... +// +// If we were to include the item #1234 in our index the naive approach would +// be to store its key as is. However, let's imagine that the list also +// contains an item with key "We W". Then it would have to reside between "We +// Are The Champions" and "We Will Rock You". So we can pretend that such an +// item exists and store in our index the fictitious entry {"We W", 1234.5}. +// When we arrive at that entry during the range narrow-down step we must round +// the item index downward if it is going to be used as the lower bound of +// the range, and round it upward if it is going to be used as the upper bound +// of the range. +class NarrowDown +{ + typedef entry_index_type index_type; + +public: // types + struct Range + { + const index_type begin, end; + }; + +public: // functions + NarrowDown() + : pred(&keyContentArea) + {} + + // Add another entry to the search index. The key of the next item is used + // to derive and store a shorter pseudo-key as explained in the long comment + // above the class. + void add(const std::string& key, index_type i, const std::string& nextKey) + { + // It would be better to reject `key >= nextKey`, but pretty old zim files were not required to + // have unique urls, just that entries were sorted by url, so two entries could have the same url. + // It is somehow a bug and has been fixed since, but we still have to be tolerant here and accept that + // two consecutive keys can be equal. 
+ if (key > nextKey) { + std::stringstream ss; + ss << "Dirent table is not properly sorted:\n"; + ss << " #" << i << ": " << key[0] << "/" << key.substr(1) << "\n"; + ss << " #" << i+1 << ": " << nextKey[0] << "/" << nextKey.substr(1); + throw ZimFileFormatError(ss.str()); + } + if ( entries.empty() ) { + addEntry(key, i); + } + else + { + const std::string pseudoKey = shortestStringInBetween(key, nextKey); + if (pred(pseudoKey, entries.back())) { + std::stringstream ss; + ss << "Dirent table is not properly sorted:\n"; + ss << "PseudoKey " << pseudoKey << " should be after (or equal) previously generated " << pred.getKeyContent(entries.back()) << "\n"; + throw ZimFileFormatError(ss.str()); + } + ASSERT(entries.back().lindex, <, i); + addEntry(pseudoKey, i); + } + } + + void close(const std::string& key, index_type i) + { + ASSERT(entries.empty() || pred(entries.back(), key), ==, true); + ASSERT(entries.empty() || entries.back().lindex < i, ==, true); + addEntry(key, i); + } + + Range getRange(const std::string& key) const + { + auto it = std::upper_bound(entries.begin(), entries.end(), key, pred); + if ( it == entries.begin() ) + return {0, 0}; + + const index_type prevEntryLindex = (it-1)->lindex; + + if ( it == entries.end() ) + return {prevEntryLindex, prevEntryLindex+1}; + + return {prevEntryLindex, it->lindex+1}; + } + + static std::string shortestStringInBetween(const std::string& a, const std::string& b) + { + ASSERT(a, <=, b); + + // msvc version of `std::mismatch(begin1, end1, begin2)` + // need `begin2 + (end1-begin1)` to be valid. + // So we cannot simply pass `a.end()` as `end1`. 
+ const auto minlen = std::min(a.size(), b.size()); + const auto m = std::mismatch(a.begin(), a.begin()+minlen, b.begin()); + return std::string(b.begin(), std::min(b.end(), m.second+1)); + } + +private: // functions + void addEntry(const std::string& s, index_type i) + { + entries.push_back({uint32_t(keyContentArea.size()), i}); + keyContentArea.insert(keyContentArea.end(), s.begin(), s.end()); + keyContentArea.push_back('\0'); + } + +private: // types + typedef std::vector KeyContentArea; + + struct Entry + { + // This is mostly a truncated version of a key from the input sequence. + // The exceptions are + // - the first item + // - the last item + // - keys that differ from their preceding key only in the last character + // + // std::string pseudoKey; // std::string has too much memory overhead. + uint32_t pseudoKeyOffset; // Instead we densely pack the key contents + // into keyContentArea and store in the entry + // the offset into that container. + + // This represents the index of the item in the input sequence right + // after which pseudoKey might be inserted without breaking the sequence + // order. In other words, the condition + // + // sequence[lindex] <= pseudoKey <= sequence[lindex+1] + // + // must be true. 
+ index_type lindex; + }; + + struct LookupPred + { + const KeyContentArea& keyContentArea; + + explicit LookupPred(const KeyContentArea* kca) + : keyContentArea(*kca) + {} + + const char* getKeyContent(const Entry& entry) const + { + return &keyContentArea[entry.pseudoKeyOffset]; + } + + bool operator()(const Entry& entry, const std::string& key) const + { + return key.compare(getKeyContent(entry)) >= 0; + } + + bool operator()(const std::string& key, const Entry& entry) const + { + return key.compare(getKeyContent(entry)) < 0; + } + }; + + typedef std::vector EntryCollection; + +private: // data + // Used to store the (shortened) keys as densely packed C-style strings + KeyContentArea keyContentArea; + + LookupPred pred; + + EntryCollection entries; +}; + +} // namespace zim + +#endif // ZIM_NARROWDOWN_H diff --git a/src/rawstreamreader.h b/src/rawstreamreader.h new file mode 100644 index 0000000..43596fc --- /dev/null +++ b/src/rawstreamreader.h @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_RAWSTREAMREADER_H +#define ZIM_RAWSTREAMREADER_H + +#include "istreamreader.h" +#include "reader.h" +#include "debug.h" + +namespace zim +{ + +class RawStreamReader : public IStreamReader +{ +public: // functions + explicit RawStreamReader(std::shared_ptr reader) + : m_reader(reader), + m_readerPos(0) + {} + + void readImpl(char* buf, zsize_t nbytes) override + { + m_reader->read(buf, m_readerPos, zsize_t(nbytes)); + m_readerPos += nbytes; + } + + std::unique_ptr sub_reader(zsize_t nbytes) override + { + auto reader = m_reader->sub_reader(m_readerPos, nbytes); + m_readerPos += nbytes; + return reader; + } + + +private: // data + std::shared_ptr m_reader; + offset_t m_readerPos; +}; + +} // namespace zim + +#endif // ZIM_RAWSTREAMREADER_H diff --git a/src/reader.h b/src/reader.h new file mode 100644 index 0000000..767b5e2 --- /dev/null +++ b/src/reader.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2017-2020 Matthieu Gautier + * Copyright (C) 2020 Veloman Yunkan + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_READER_H_ +#define ZIM_READER_H_ + +#include + +#include "zim_types.h" +#include "endian_tools.h" +#include "debug.h" + +#include "buffer.h" + +namespace zim { + +class Reader { + public: + Reader() {}; + virtual zsize_t size() const = 0; + virtual ~Reader() {}; + + virtual void read(char* dest, offset_t offset, zsize_t size) const = 0; + template + T read_uint(offset_t offset) const { + ASSERT(offset.v, <, size().v); + ASSERT(offset.v+sizeof(T), <=, size().v); + char tmp_buf[sizeof(T)]; + read(tmp_buf, offset, zsize_t(sizeof(T))); + return fromLittleEndian(tmp_buf); + } + virtual char read(offset_t offset) const = 0; + + virtual const Buffer get_buffer(offset_t offset, zsize_t size) const = 0; + const Buffer get_buffer(offset_t offset) const { + return get_buffer(offset, zsize_t(size().v-offset.v)); + } + virtual std::unique_ptr sub_reader(offset_t offset, zsize_t size) const = 0; + std::unique_ptr sub_reader(offset_t offset) const { + return sub_reader(offset, zsize_t(size().v-offset.v)); + } + virtual offset_t offset() const = 0; + + bool can_read(offset_t offset, zsize_t size) const; +}; + +}; + +#endif // ZIM_READER_H_ diff --git a/src/search.cpp b/src/search.cpp new file mode 100644 index 0000000..e0ed54d --- /dev/null +++ b/src/search.cpp @@ -0,0 +1,348 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2021 Veloman Yunkan + * Copyright (C) 2020 Emmanuel Engelhart + * Copyright (C) 2018 Kunal Mehta + * Copyright (C) 2007 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your 
option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include "fileimpl.h" +#include "search_internal.h" +#include "fs.h" +#include "tools.h" + +#include + +#include +#include +#if !defined(_WIN32) +# include +#else +# include +#endif +#include + +#include "xapian.h" +#include + +#include "constants.h" + +#define MAX_MATCHES_TO_SORT 10000 + +namespace zim +{ + +InternalDataBase::InternalDataBase(const std::vector& archives, bool verbose) + : m_verbose(verbose) +{ + bool first = true; + m_queryParser.set_database(m_database); + m_queryParser.set_default_op(Xapian::Query::op::OP_AND); + + for(auto& archive: archives) { + auto impl = archive.getImpl(); + FileImpl::FindxResult r; + r = impl->findx('X', "fulltext/xapian"); + if (!r.first) { + r = impl->findx('Z', "/fulltextIndex/xapian"); + } + if (!r.first) { + continue; + } + auto xapianEntry = Entry(impl, entry_index_type(r.second)); + auto accessInfo = xapianEntry.getItem().getDirectAccessInformation(); + if (accessInfo.second == 0) { + continue; + } + + Xapian::Database database; + if (!getDbFromAccessInfo(accessInfo, database)) { + continue; + } + + if ( first ) { + m_valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + auto language = database.get_metadata("language"); + if (language.empty() ) { + // Database created before 2017/03 has no language metadata. + // However, term were stemmed anyway and we need to stem our + // search query the same the database was created. 
+ // So we need a language, let's use the one of the zim. + // If zimfile has no language metadata, we can't do a lot more here :/ + try { + language = archive.getMetadata("Language"); + } catch(...) {} + } + if (!language.empty()) { + icu::Locale languageLocale(language.c_str()); + /* Configuring language-based stemming */ + try { + m_stemmer = Xapian::Stem(languageLocale.getLanguage()); + m_queryParser.set_stemmer(m_stemmer); + m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL); + } catch (...) { + std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + } + } + auto stopwords = database.get_metadata("stopwords"); + if ( !stopwords.empty() ){ + std::string stopWord; + std::istringstream file(stopwords); + Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper(); + while (std::getline(file, stopWord, '\n')) { + stopper->add(stopWord); + } + stopper->release(); + m_queryParser.set_stopper(stopper); + } + } else { + std::map valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + if (m_valuesmap != valuesmap ) { + // [TODO] Ignore the database, raise an error ?
+ } + } + m_xapianDatabases.push_back(database); + m_database.add_database(database); + m_archives.push_back(archive); + first = false; + } +} + +bool InternalDataBase::hasDatabase() const +{ + return !m_xapianDatabases.empty(); +} + +bool InternalDataBase::hasValuesmap() const +{ + return !m_valuesmap.empty(); +} + +bool InternalDataBase::hasValue(const std::string& valueName) const +{ + return (m_valuesmap.find(valueName) != m_valuesmap.end()); +} + +int InternalDataBase::valueSlot(const std::string& valueName) const +{ + return m_valuesmap.at(valueName); +} + +Xapian::Query InternalDataBase::parseQuery(const Query& query) +{ + Xapian::Query xquery; + + xquery = m_queryParser.parse_query(query.m_query); + + if (query.m_geoquery && hasValue("geo.position")) { + Xapian::GreatCircleMetric metric; + Xapian::LatLongCoord centre(query.m_latitude, query.m_longitude); + Xapian::LatLongDistancePostingSource ps(valueSlot("geo.position"), centre, metric, query.m_distance); + Xapian::Query geoQuery(&ps); + if (query.m_query.empty()) { + xquery = geoQuery; + } else { + xquery = Xapian::Query(Xapian::Query::OP_FILTER, xquery, geoQuery); + } + } + + return xquery; +} + +Searcher::Searcher(const std::vector& archives) : + mp_internalDb(nullptr), + m_verbose(false) +{ + for ( const auto& a : archives ) { + addArchive(a); + } +} + +Searcher::Searcher(const Archive& archive) : + mp_internalDb(nullptr), + m_verbose(false) +{ + addArchive(archive); +} + +Searcher::Searcher(const Searcher& other) = default; +Searcher& Searcher::operator=(const Searcher& other) = default; +Searcher::Searcher(Searcher&& other) = default; +Searcher& Searcher::operator=(Searcher&& other) = default; +Searcher::~Searcher() = default; + +namespace +{ + +bool archivesAreEquivalent(const Archive& a1, const Archive& a2) +{ + return a1.getUuid() == a2.getUuid(); +} + +bool contains(const std::vector& archives, const Archive& newArchive) +{ + for ( const auto& a : archives ) { + if ( archivesAreEquivalent(a, 
newArchive) ) { + return true; + } + } + return false; +} + +} // unnamed namespace + +Searcher& Searcher::addArchive(const Archive& archive) { + if ( !contains(m_archives, archive) ) { + m_archives.push_back(archive); + mp_internalDb.reset(); + } + return *this; +} + +Search Searcher::search(const Query& query) +{ + if (!mp_internalDb) { + initDatabase(); + } + + if (!mp_internalDb->hasDatabase()) { + throw(std::runtime_error("Cannot create Search without FT Xapian index")); + } + + return Search(mp_internalDb, query); +} + +void Searcher::setVerbose(bool verbose) +{ + m_verbose = verbose; +} + +void Searcher::initDatabase() +{ + mp_internalDb = std::make_shared(m_archives, m_verbose); +} + +Search::Search(std::shared_ptr p_internalDb, const Query& query) + : mp_internalDb(p_internalDb), + mp_enquire(nullptr), + m_query(query) +{ +} + +Search::Search(Search&& s) = default; +Search& Search::operator=(Search&& s) = default; +Search::~Search() = default; + +Query::Query(const std::string& query) : + m_query(query) +{} + +Query& Query::setQuery(const std::string& query) { + m_query = query; + return *this; +} + +Query& Query::setGeorange(float latitude, float longitude, float distance) { + m_latitude = latitude; + m_longitude = longitude; + m_distance = distance; + m_geoquery = true; + return *this; +} + +int Search::getEstimatedMatches() const +{ + try { + auto enquire = getEnquire(); + // Force xapian to check at least 10 documents even if we ask for an empty mset. + // Else, the get_matches_estimated may be wrong and return 0 even if we have results. 
+ auto mset = enquire.get_mset(0, 0, 10); + return mset.get_matches_estimated(); + } catch(Xapian::QueryParserError& e) { + return 0; + } +} + +const SearchResultSet Search::getResults(int start, int maxResults) const { + try { + auto enquire = getEnquire(); + auto mset = enquire.get_mset(start, maxResults); + return SearchResultSet(mp_internalDb, std::move(mset)); + } catch(Xapian::QueryParserError& e) { + return SearchResultSet(mp_internalDb); + } +} + +Xapian::Enquire& Search::getEnquire() const +{ + if ( mp_enquire ) { + return *mp_enquire; + } + + auto enquire = std::unique_ptr(new Xapian::Enquire(mp_internalDb->m_database)); + + auto query = mp_internalDb->parseQuery(m_query); + if (mp_internalDb->m_verbose) { + std::cout << "Parsed query '" << m_query.m_query << "' to " << query.get_description() << std::endl; + } + enquire->set_query(query); + + mp_enquire = std::move(enquire); + return *mp_enquire; +} + + +SearchResultSet::SearchResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset) : + mp_internalDb(p_internalDb), + mp_mset(std::make_shared(mset)) +{} + +SearchResultSet::SearchResultSet(std::shared_ptr p_internalDb) : + mp_internalDb(p_internalDb), + mp_mset(nullptr) +{} + +int SearchResultSet::size() const +{ + if (! mp_mset) { + return 0; + } + return mp_mset->size(); +} + +SearchResultSet::iterator SearchResultSet::begin() const +{ + if ( ! mp_mset ) { + return nullptr; + } + return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin()); +} + +SearchResultSet::iterator SearchResultSet::end() const +{ + if ( ! 
mp_mset ) { + return nullptr; + } + return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end()); +} + +} //namespace zim diff --git a/src/search_internal.h b/src/search_internal.h new file mode 100644 index 0000000..c9af919 --- /dev/null +++ b/src/search_internal.h @@ -0,0 +1,146 @@ +/* + * Copyright (C) 2021 Manneesh P M + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SEARCH_INTERNAL_H +#define ZIM_SEARCH_INTERNAL_H + +#include + +#include +#include + +namespace zim { + +/** + * A class to encapsulate a xapian database and all the information we can gather from it. + */ +class InternalDataBase { + public: // methods + InternalDataBase(const std::vector& archives, bool verbose); + bool hasDatabase() const; + bool hasValuesmap() const; + bool hasValue(const std::string& valueName) const; + int valueSlot(const std::string& valueName) const; + + Xapian::Query parseQuery(const Query& query); + + public: // data + // The (main) database we will search on (wrapping other xapian databases). + Xapian::Database m_database; + + // The real databases. + std::vector m_xapianDatabases; + + // The archives we are searching on. 
+ std::vector m_archives; + + // The valuesmap associated with the database. + std::map m_valuesmap; + + // If the database is open for suggestion. + // True even if the database has no newSuggestionformat. NOTE(review): the InternalDataBase constructor in search.cpp only sets m_verbose in its initializer list and never assigns this flag -- confirm it is initialized before it is read. + bool m_suggestionMode; + + // The query parser corresponding to the database. + Xapian::QueryParser m_queryParser; + + // The stemmer used to parse queries + Xapian::Stem m_stemmer; + + // Verbosity of operations. + bool m_verbose; +}; + +struct SearchIterator::InternalData { + std::shared_ptr mp_internalDb; + std::shared_ptr mp_mset; + Xapian::MSetIterator iterator; + Xapian::Document _document; + bool document_fetched; + std::unique_ptr _entry; + + InternalData(const InternalData& other) : + mp_internalDb(other.mp_internalDb), + mp_mset(other.mp_mset), + iterator(other.iterator), + _document(other._document), + document_fetched(other.document_fetched), + _entry(other._entry ? new Entry(*other._entry) : nullptr ) + { + } + + InternalData& operator=(const InternalData& other) + { + if (this != &other) { + mp_internalDb = other.mp_internalDb; + mp_mset = other.mp_mset; + iterator = other.iterator; + _document = other._document; + document_fetched = other.document_fetched; + _entry.reset(other._entry ?
new Entry(*other._entry) : nullptr); + } + return *this; + } + + InternalData(std::shared_ptr p_internalDb, std::shared_ptr p_mset, Xapian::MSetIterator iterator) : + mp_internalDb(p_internalDb), + mp_mset(p_mset), + iterator(iterator), + document_fetched(false) + {}; + + Xapian::Document get_document() { + if ( !document_fetched ) { + if (iterator == mp_mset->end()) { + throw std::runtime_error("Cannot get entry for end iterator"); + } + _document = iterator.get_document(); + document_fetched = true; + } + return _document; + } + + int get_databasenumber() { + Xapian::docid docid = *iterator; + return (docid - 1) % mp_internalDb->m_archives.size(); + } + + Entry& get_entry() { + if ( !_entry ) { + int databasenumber = get_databasenumber(); + auto archive = mp_internalDb->m_archives.at(databasenumber); + _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data()))); + } + return *_entry.get(); + } + + bool operator==(const InternalData& other) const { + return (mp_internalDb == other.mp_internalDb + && mp_mset == other.mp_mset + && iterator == other.iterator); + } +}; + + + +}; //namespace zim + +#endif //ZIM_SEARCH_INTERNAL_H diff --git a/src/search_iterator.cpp b/src/search_iterator.cpp new file mode 100644 index 0000000..0c5cbca --- /dev/null +++ b/src/search_iterator.cpp @@ -0,0 +1,245 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#define ZIM_PRIVATE + +#include "xapian/myhtmlparse.h" +#include +#include +#include +#include +#include "search_internal.h" + +namespace zim { + + +SearchIterator::~SearchIterator() = default; +SearchIterator::SearchIterator(SearchIterator&& it) = default; +SearchIterator& SearchIterator::operator=(SearchIterator&& it) = default; + +SearchIterator::SearchIterator() : SearchIterator(nullptr) +{}; + +SearchIterator::SearchIterator(InternalData* internal_data) + : internal(internal_data) +{} + +SearchIterator::SearchIterator(const SearchIterator& it) + : internal(nullptr) +{ + if (it.internal) internal = std::unique_ptr(new InternalData(*it.internal)); +} + +SearchIterator & SearchIterator::operator=(const SearchIterator& it) { + if ( ! it.internal ) internal.reset(); + else if ( ! internal ) internal = std::unique_ptr(new InternalData(*it.internal)); + else *internal = *it.internal; + + return *this; +} + +bool SearchIterator::operator==(const SearchIterator& it) const { + if ( ! internal && ! it.internal) { + return true; + } + if ( ! internal || ! it.internal) { + return false; + } + return (*internal == *it.internal); +} + +bool SearchIterator::operator!=(const SearchIterator& it) const { + return ! (*this == it); +} + +SearchIterator& SearchIterator::operator++() { + if ( ! internal ) { + return *this; + } + ++(internal->iterator); + internal->document_fetched = false; + internal->_entry.reset(); + return *this; +} + +SearchIterator SearchIterator::operator++(int) { + SearchIterator it = *this; + operator++(); + return it; +} + +SearchIterator& SearchIterator::operator--() { + if ( ! 
internal ) { + return *this; + } + --(internal->iterator); + internal->document_fetched = false; + internal->_entry.reset(); + return *this; +} + +SearchIterator SearchIterator::operator--(int) { + SearchIterator it = *this; + operator--(); + return it; +} + +std::string SearchIterator::getPath() const { + if ( ! internal ) { + return ""; + } + + std::string path = internal->get_document().get_data(); + bool hasNewNamespaceScheme = internal->mp_internalDb->m_archives.at(getFileIndex()).hasNewNamespaceScheme(); + + std::string dbDataType = internal->mp_internalDb->m_database.get_metadata("data"); + if (dbDataType.empty()) { + dbDataType = "fullPath"; + } + + // If the archive has new namespace scheme and the type of its indexed data + // is `fullPath` we return only the `path` without namespace + if (hasNewNamespaceScheme && dbDataType == "fullPath") { + path = path.substr(2); + } + return path; +} + +std::string SearchIterator::getDbData() const { + if ( ! internal ) { + return ""; + } + + return internal->get_document().get_data(); +} + +std::string SearchIterator::getTitle() const { + if ( ! internal ) { + return ""; + } + return internal->get_entry().getTitle(); +} + +int SearchIterator::getScore() const { + if ( ! internal ) { + return 0; + } + return internal->iterator.get_percent(); +} + +std::string SearchIterator::getSnippet() const { + if ( ! internal ) { + return ""; + } + + // Generate full text snippet + if ( ! internal->mp_internalDb->hasValuesmap() ) + { + /* This is the old legacy version. Guess and try */ + std::string stored_snippet = internal->get_document().get_value(1); + if ( ! 
stored_snippet.empty() ) + return stored_snippet; + /* Let's continue here, and see if we can generate one */ + } + else if ( internal->mp_internalDb->hasValue("snippet") ) + { + return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet")); + } + /* No reader, no snippet */ + try { + Entry& entry = internal->get_entry(); + /* Get the content of the item to generate a snippet. + We parse it and use the html dump to avoid remove html tags in the + content and be able to nicely cut the text at random place. */ + zim::MyHtmlParser htmlParser; + std::string content = entry.getItem().getData(); + try { + htmlParser.parse_html(content, "UTF-8", true); + } catch (...) {} + return internal->mp_mset->snippet(htmlParser.dump, + /*length=*/500, + /*stemmer=*/internal->mp_internalDb->m_stemmer, + /*flags=*/0); + } catch (...) { + return ""; + } +} + +int SearchIterator::getSize() const { + if ( ! internal ) { + return -1; + } + if ( ! internal->mp_internalDb->hasValuesmap() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(2).empty() == true ? -1 : atoi(internal->get_document().get_value(2).c_str()); + } + else if ( internal->mp_internalDb->hasValue("size") ) + { + return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("size")).c_str()); + } + /* The size is never used. Do we really want to get the content and + calculate the size ? */ + return -1; +} + +int SearchIterator::getWordCount() const { + if ( ! internal ) { + return -1; + } + if ( ! internal->mp_internalDb->hasValuesmap() ) + { + /* This is the old legacy version. Guess and try */ + return internal->get_document().get_value(3).empty() == true ?
-1 : atoi(internal->get_document().get_value(3).c_str()); + } + else if ( internal->mp_internalDb->hasValue("wordcount") ) + { + return atoi(internal->get_document().get_value(internal->mp_internalDb->valueSlot("wordcount")).c_str()); + } + return -1; +} + +int SearchIterator::getFileIndex() const { + if ( internal ) { + return internal->get_databasenumber(); + } + return 0; +} + +Uuid SearchIterator::getZimId() const { + if (! internal ) { + throw std::runtime_error("Cannot get zimId from uninitialized iterator"); + } + return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid(); +} + +SearchIterator::reference SearchIterator::operator*() const { + if (! internal ) { + throw std::runtime_error("Cannot get a entry for a uninitialized iterator"); + } + return internal->get_entry(); +} + +SearchIterator::pointer SearchIterator::operator->() const { + return &**this; +} + + +} // namespace zim diff --git a/src/suggestion.cpp b/src/suggestion.cpp new file mode 100644 index 0000000..ab4dc6e --- /dev/null +++ b/src/suggestion.cpp @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2021 Maneesh P M + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#define ZIM_PRIVATE + +#include +#include +#include "suggestion_internal.h" +#include +#include "fileimpl.h" +#include "tools.h" +#include "constants.h" + +#if defined(ENABLE_XAPIAN) +#include +#endif // ENABLE_XAPIAN + +namespace zim +{ + +SuggestionDataBase::SuggestionDataBase(const Archive& archive, bool verbose) + : m_archive(archive), + m_verbose(verbose) +{ +// Initialize Xapian DB if it is enabled +#if defined(ENABLE_XAPIAN) + initXapianDb(); +#endif // ENABLE_XAPIAN +} + +#if defined(ENABLE_XAPIAN) +void SuggestionDataBase::initXapianDb() { + m_queryParser.set_database(m_database); + m_queryParser.set_default_op(Xapian::Query::op::OP_AND); + + auto impl = m_archive.getImpl(); + FileImpl::FindxResult r; + + r = impl->findx('X', "title/xapian"); + if (!r.first) { + return; + } + + auto xapianEntry = Entry(impl, entry_index_type(r.second)); + auto accessInfo = xapianEntry.getItem().getDirectAccessInformation(); + if (accessInfo.second == 0) { + return; + } + + Xapian::Database database; + if (!getDbFromAccessInfo(accessInfo, database)) { + return; + } + + m_valuesmap = read_valuesmap(database.get_metadata("valuesmap")); + auto language = database.get_metadata("language"); + if (language.empty() ) { + // Database created before 2017/03 has no language metadata. + // However, term were stemmed anyway and we need to stem our + // search query the same the database was created. + // So we need a language, let's use the one of the zim. + // If zimfile has no language metadata, we can't do lot more here :/ + try { + language = m_archive.getMetadata("Language"); + } catch(...) 
{} + } + if (!language.empty()) { + icu::Locale languageLocale(language.c_str()); + /* Configuring language-based stemming */ + try { + m_stemmer = Xapian::Stem(languageLocale.getLanguage()); + m_queryParser.set_stemmer(m_stemmer); + } catch (...) { + std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl; + } + } + + m_database = database; +} + +bool SuggestionDataBase::hasDatabase() const +{ + return !m_database.internal.empty(); +} + +bool SuggestionDataBase::hasValuesmap() const +{ + return !m_valuesmap.empty(); +} + +bool SuggestionDataBase::hasValue(const std::string& valueName) const +{ + return (m_valuesmap.find(valueName) != m_valuesmap.end()); +} + +int SuggestionDataBase::valueSlot(const std::string& valueName) const +{ + return m_valuesmap.at(valueName); +} + +/* + * subquery_phrase: selects documents that have the terms in the order of the query + * within a specified window. + * subquery_anchored: selects documents that have the terms in the order of the + * query within a specified window and starts from the beginning of the document. + * subquery_and: selects documents that have all the terms in the query. + * + * subquery_phrase and subquery_anchored by themselves are quite exclusive. To + * include more "similar" docs, we combine them with subquery_and using OP_OR + * operator. If a particular document has a weight of A in subquery_and and B + * in subquery_phrase and C in subquery_anchored, the net weight of that document + * becomes A+B+C (normalised out of 100). So the documents closer to the query + * get a higher relevance.
+ */ +Xapian::Query SuggestionDataBase::parseQuery(const std::string& query) +{ + std::lock_guard locker(m_mutex); + Xapian::Query xquery; + + const auto flags = Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_PARTIAL; + + // Reset stemming strategy for normal parsing + m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME); + xquery = m_queryParser.parse_query(query, flags); + + if (!query.empty()) { + // Reconfigure stemming strategy for phrase search + m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE); + + Xapian::Query subquery_phrase = m_queryParser.parse_query(query); + // Force the OP_PHRASE window to be equal to the number of terms. + subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length()); + + auto qs = ANCHOR_TERM + query; + Xapian::Query subquery_anchored = m_queryParser.parse_query(qs); + subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_anchored.get_terms_begin(), subquery_anchored.get_terms_end(), subquery_anchored.get_length()); + + xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_phrase); + xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_anchored); + } + + return xquery; +} + +#endif // ENABLE_XAPIAN + +SuggestionSearcher::SuggestionSearcher(const Archive& archive) : + mp_internalDb(nullptr), + m_archive(archive), + m_verbose(false) +{} + +SuggestionSearcher::SuggestionSearcher(const SuggestionSearcher& other) = default; +SuggestionSearcher& SuggestionSearcher::operator=(const SuggestionSearcher& other) = default; +SuggestionSearcher::SuggestionSearcher(SuggestionSearcher&& other) = default; +SuggestionSearcher& SuggestionSearcher::operator=(SuggestionSearcher&& other) = default; +SuggestionSearcher::~SuggestionSearcher() = default; + +SuggestionSearch SuggestionSearcher::suggest(const std::string& query) +{ + if (!mp_internalDb) { + initDatabase(); + } + return 
SuggestionSearch(mp_internalDb, query); +} + +void SuggestionSearcher::setVerbose(bool verbose) +{ + m_verbose = verbose; +} + +void SuggestionSearcher::initDatabase() +{ + mp_internalDb = std::make_shared(m_archive, m_verbose); +} + +SuggestionSearch::SuggestionSearch(std::shared_ptr p_internalDb, const std::string& query) + : mp_internalDb(p_internalDb), + m_query(query) +#if defined(ENABLE_XAPIAN) + , mp_enquire(nullptr) +#endif // ENABLE_XAPIAN +{} + +SuggestionSearch::SuggestionSearch(SuggestionSearch&& s) = default; +SuggestionSearch& SuggestionSearch::operator=(SuggestionSearch&& s) = default; +SuggestionSearch::~SuggestionSearch() = default; + +int SuggestionSearch::getEstimatedMatches() const +{ +#if defined(ENABLE_XAPIAN) + if (mp_internalDb->hasDatabase()) { + try { + auto enquire = getEnquire(); + // Force xapian to check at least 10 documents even if we ask for an empty mset. + // Else, the get_matches_estimated may be wrong and return 0 even if we have results. + auto mset = enquire.get_mset(0, 0, 10); + return mset.get_matches_estimated(); + } catch(...) { + std::cerr << "Query Parsing failed, Switching to search without index." << std::endl; + } + } +#endif // ENABLE_XAPIAN + + return mp_internalDb->m_archive.findByTitle(m_query).size(); +} + +const SuggestionResultSet SuggestionSearch::getResults(int start, int maxResults) const { +#if defined(ENABLE_XAPIAN) + if (mp_internalDb->hasDatabase()) + { + try { + auto enquire = getEnquire(); + auto mset = enquire.get_mset(start, maxResults); + return SuggestionResultSet(mp_internalDb, std::move(mset)); + } catch(...) { + std::cerr << "Query Parsing failed, Switching to search without index." 
<< std::endl; + } + } +#endif // ENABLE_XAPIAN + + auto entryRange = mp_internalDb->m_archive.findByTitle(m_query); + entryRange.offset(start, maxResults); + return SuggestionResultSet(entryRange); +} + +/* Closes the shared Xapian database when built with ENABLE_XAPIAN; presumably this makes later searches use the title-range path - TODO confirm against hasDatabase(). */ +const void SuggestionSearch::forceRangeSuggestion() { +#if defined(ENABLE_XAPIAN) + mp_internalDb->m_database.close(); +#endif // ENABLE_XAPIAN +} + +#if defined(ENABLE_XAPIAN) +/* Returns the Xapian::Enquire for this search, building it from the unaccented query on first use and reusing the object cached in mp_enquire afterwards. */ +Xapian::Enquire& SuggestionSearch::getEnquire() const +{ + if ( mp_enquire ) { + return *mp_enquire; + } + + auto enquire = std::unique_ptr(new Xapian::Enquire(mp_internalDb->m_database)); + + const auto unaccentedQuery = removeAccents(m_query); + auto query = mp_internalDb->parseQuery(unaccentedQuery); + if (mp_internalDb->m_verbose) { + std::cout << "Parsed query '" << unaccentedQuery << "' to " << query.get_description() << std::endl; + } + enquire->set_query(query); + + /* + * In suggestion mode, we are searching over a separate title index. Default BM25 is not + * adapted for this case. WDF factor(k1) controls the effect of within document frequency. + * k1 = 0.001 reduces the effect of word repetition in document. In BM25, smaller documents + * get larger weights, so normalising the length of documents is necessary using b = 1. + * The document set is first sorted by their relevance score then by value so that suggestion + * results are closer to search string. 
+ * refer https://xapian.org/docs/apidoc/html/classXapian_1_1BM25Weight.html + */ + + enquire->set_weighting_scheme(Xapian::BM25Weight(0.001,0,1,1,0.5)); + if (mp_internalDb->hasValue("title")) { + enquire->set_sort_by_relevance_then_value(mp_internalDb->valueSlot("title"), false); + } + + if (mp_internalDb->hasValue("targetPath")) { + enquire->set_collapse_key(mp_internalDb->valueSlot("targetPath")); + } + + mp_enquire = std::move(enquire); + return *mp_enquire; +} + +SuggestionResultSet::SuggestionResultSet(std::shared_ptr p_internalDb, Xapian::MSet&& mset) : + mp_internalDb(p_internalDb), + mp_entryRange(nullptr), + mp_mset(std::make_shared(mset)) +{} +#endif // ENABLE_XAPIAN + +SuggestionResultSet::SuggestionResultSet(EntryRange entryRange) : + mp_internalDb(nullptr), + mp_entryRange(std::unique_ptr(new EntryRange(entryRange))) +#if defined(ENABLE_XAPIAN) + , mp_mset(nullptr) +#endif // ENABLE_XAPIAN +{} + +int SuggestionResultSet::size() const +{ +#if defined(ENABLE_XAPIAN) + if (! mp_entryRange) { + return mp_mset->size(); + } +#endif // ENABLE_XAPIAN + + return mp_entryRange->size(); +} + +SuggestionResultSet::iterator SuggestionResultSet::begin() const +{ +#if defined(ENABLE_XAPIAN) + if ( ! mp_entryRange ) { + return new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->begin()); + } +#endif // ENABLE_XAPIAN + + return iterator(mp_entryRange->begin()); +} + +SuggestionResultSet::iterator SuggestionResultSet::end() const +{ +#if defined(ENABLE_XAPIAN) + if ( ! 
mp_entryRange ) { + return new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->end()); + } +#endif // ENABLE_XAPIAN + + return iterator(mp_entryRange->end()); +} + +} // namespace zim diff --git a/src/suggestion_internal.h b/src/suggestion_internal.h new file mode 100644 index 0000000..fa338fc --- /dev/null +++ b/src/suggestion_internal.h @@ -0,0 +1,150 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_SUGGESTION_INTERNAL_H +#define ZIM_SUGGESTION_INTERNAL_H + +#include "zim/suggestion.h" +#include "zim/archive.h" + +#include +#include + +#if defined(LIBZIM_WITH_XAPIAN) +#include +#endif + +namespace zim +{ + +/** + * A class to encapsulate a xapian title index and it's archive and all the + * information we can gather from it. + */ +class SuggestionDataBase { + public: // methods + SuggestionDataBase(const Archive& archive, bool verbose); + + public: // data + // The archive to get suggestions from. + Archive m_archive; + + // Verbosity of operations. 
+ bool m_verbose; + + private: // data + std::mutex m_mutex; + +#if defined(LIBZIM_WITH_XAPIAN) + + public: // xapian based methods + bool hasDatabase() const; + bool hasValuesmap() const; + bool hasValue(const std::string& valueName) const; + int valueSlot(const std::string& valueName) const; + + Xapian::Query parseQuery(const std::string& query); + + public: // xapian based data + // The Xapian database we will search on. + Xapian::Database m_database; + + // The valuesmap associated with the database. + std::map m_valuesmap; + + // The query parser corresponding to the database. + Xapian::QueryParser m_queryParser; + + // The stemmer used to parse queries + Xapian::Stem m_stemmer; + + private: + void initXapianDb(); +#endif // LIBZIM_WITH_XAPIAN +}; + +#if defined(LIBZIM_WITH_XAPIAN) +struct SuggestionIterator::SuggestionInternalData { + std::shared_ptr mp_internalDb; + std::shared_ptr mp_mset; + Xapian::MSetIterator iterator; + Xapian::Document _document; + bool document_fetched; + std::unique_ptr _entry; + + SuggestionInternalData(const SuggestionInternalData& other) : + mp_internalDb(other.mp_internalDb), + mp_mset(other.mp_mset), + iterator(other.iterator), + _document(other._document), + document_fetched(other.document_fetched), + _entry(other._entry ? new Entry(*other._entry) : nullptr ) + { + } + + SuggestionInternalData& operator=(const SuggestionInternalData& other) + { + if (this != &other) { + mp_internalDb = other.mp_internalDb; + mp_mset = other.mp_mset; + iterator = other.iterator; + _document = other._document; + document_fetched = other.document_fetched; + _entry.reset(other._entry ? 
new Entry(*other._entry) : nullptr); + } + return *this; + } + + SuggestionInternalData(std::shared_ptr p_internalDb, std::shared_ptr p_mset, Xapian::MSetIterator iterator) : + mp_internalDb(p_internalDb), + mp_mset(p_mset), + iterator(iterator), + document_fetched(false) + {}; + + Xapian::Document get_document() { + if ( !document_fetched ) { + if (iterator == mp_mset->end()) { + throw std::runtime_error("Cannot get entry for end iterator"); + } + _document = iterator.get_document(); + document_fetched = true; + } + return _document; + } + + Entry& get_entry() { + if (!_entry) { + _entry.reset(new Entry(mp_internalDb->m_archive.getEntryByPath(get_document().get_data()))); + } + return *_entry.get(); + } + + bool operator==(const SuggestionInternalData& other) const { + return (mp_internalDb == other.mp_internalDb + && mp_mset == other.mp_mset + && iterator == other.iterator); + } +}; +#endif // LIBZIM_WITH_XAPIAN + +} + +#endif // ZIM_SUGGESTION_INTERNAL_H diff --git a/src/suggestion_iterator.cpp b/src/suggestion_iterator.cpp new file mode 100644 index 0000000..4a2be50 --- /dev/null +++ b/src/suggestion_iterator.cpp @@ -0,0 +1,238 @@ +/* + * Copyright (C) 2021 Maneesh P M + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#define ZIM_PRIVATE + +#include "zim/suggestion_iterator.h" +#include "suggestion_internal.h" +#include + +namespace zim +{ + +SuggestionIterator::~SuggestionIterator() = default; +SuggestionIterator::SuggestionIterator(SuggestionIterator&& it) = default; +SuggestionIterator& SuggestionIterator::operator=(SuggestionIterator&& it) = default; + +SuggestionIterator::SuggestionIterator(RangeIterator rangeIterator) + : mp_rangeIterator(std::unique_ptr(new RangeIterator(rangeIterator))) +#if defined(LIBZIM_WITH_XAPIAN) + , mp_internal(nullptr) +#endif // LIBZIM_WITH_XAPIAN +{} + +#if defined(LIBZIM_WITH_XAPIAN) +SuggestionIterator::SuggestionIterator(SuggestionInternalData* internal) + : mp_rangeIterator(nullptr), + mp_internal(internal) +{} +#endif // LIBZIM_WITH_XAPIAN + +SuggestionIterator::SuggestionIterator(const SuggestionIterator& it) + : mp_rangeIterator(nullptr) +{ +#if defined(LIBZIM_WITH_XAPIAN) + mp_internal.reset(nullptr); + if (it.mp_internal) { + mp_internal = std::unique_ptr(new SuggestionInternalData(*it.mp_internal)); + } +#endif // LIBZIM_WITH_XAPIAN + + if (it.mp_rangeIterator) { + mp_rangeIterator = std::unique_ptr(new RangeIterator(*it.mp_rangeIterator)); + } +} + +/* Copy assignment: deep-copies the range iterator and (with Xapian) the internal data of `it`, then drops the cached suggestion item. */ +SuggestionIterator& SuggestionIterator::operator=(const SuggestionIterator& it) { + /* Self-assignment guard: the resets below would otherwise destroy the state we are about to copy from, leaving the iterator empty. */ + if (this == &it) { + return *this; + } + + mp_rangeIterator.reset(); + if (it.mp_rangeIterator) { + mp_rangeIterator.reset(new RangeIterator(*it.mp_rangeIterator)); + } + +#if defined(LIBZIM_WITH_XAPIAN) + mp_internal.reset(); + if (it.mp_internal) { + mp_internal.reset(new SuggestionInternalData(*it.mp_internal)); + } +#endif // LIBZIM_WITH_XAPIAN + + m_suggestionItem.reset(); + return *this; +} + +bool SuggestionIterator::operator==(const SuggestionIterator& it) const { + if (mp_rangeIterator && it.mp_rangeIterator) { + 
return (*mp_rangeIterator == *it.mp_rangeIterator); + } + +#if defined(LIBZIM_WITH_XAPIAN) + if (mp_internal && it.mp_internal) { + return (*mp_internal == *it.mp_internal); + } +#endif // LIBZIM_WITH_XAPIAN + + return false; +} + +bool SuggestionIterator::operator!=(const SuggestionIterator& it) const { + return ! (*this == it); +} + +SuggestionIterator& SuggestionIterator::operator++() { +#if defined(LIBZIM_WITH_XAPIAN) + if (mp_internal) { + ++(mp_internal->iterator); + mp_internal->_entry.reset(); + mp_internal->document_fetched = false; + } +#endif // LIBZIM_WITH_XAPIAN + + if (mp_rangeIterator) { + ++(*mp_rangeIterator); + } + m_suggestionItem.reset(); + return *this; +} + +SuggestionIterator SuggestionIterator::operator++(int) { + SuggestionIterator it = *this; + operator++(); + return it; +} + +SuggestionIterator& SuggestionIterator::operator--() { +#if defined(LIBZIM_WITH_XAPIAN) + if (mp_internal) { + --(mp_internal->iterator); + mp_internal->_entry.reset(); + mp_internal->document_fetched = false; + } +#endif // LIBZIM_WITH_XAPIAN + + if (mp_rangeIterator) { + --(*mp_rangeIterator); + } + m_suggestionItem.reset(); + return *this; +} + +SuggestionIterator SuggestionIterator::operator--(int) { + SuggestionIterator it = *this; + operator--(); + return it; +} + +Entry SuggestionIterator::getEntry() const { +#if defined(LIBZIM_WITH_XAPIAN) + if (mp_internal) { + return mp_internal->get_entry(); + } +#endif // LIBZIM_WITH_XAPIAN + + if (mp_rangeIterator) { + return **mp_rangeIterator; + } + throw std::runtime_error("Cannot dereference iterator"); +} + +#if defined(LIBZIM_WITH_XAPIAN) +std::string SuggestionIterator::getDbData() const { + if (! mp_internal) { + return ""; + } + + return mp_internal->get_document().get_data(); +} + +std::string SuggestionIterator::getIndexPath() const +{ + if (! 
mp_internal) { + return ""; + } + + std::string path = mp_internal->get_document().get_data(); + bool hasNewNamespaceScheme = mp_internal->mp_internalDb->m_archive.hasNewNamespaceScheme(); + + std::string dbDataType = mp_internal->mp_internalDb->m_database.get_metadata("data"); + if (dbDataType.empty()) { + dbDataType = "fullPath"; + } + + // If the archive has new namespace scheme and the type of its indexed data + // is `fullPath` we return only the `path` without namespace + if (hasNewNamespaceScheme && dbDataType == "fullPath") { + path = path.substr(2); + } + return path; +} + +std::string SuggestionIterator::getIndexTitle() const { + if ( ! mp_internal) { + return ""; + } + try { + return mp_internal->get_entry().getTitle(); + } catch (...) { + return ""; + } +} + +std::string SuggestionIterator::getIndexSnippet() const { + if (! mp_internal) { + return ""; + } + + try { + return mp_internal->mp_mset->snippet(getIndexTitle(), 500, mp_internal->mp_internalDb->m_stemmer); + } catch(...) 
{ + return ""; + } +} +#endif // LIBZIM_WITH_XAPIAN + +const SuggestionItem& SuggestionIterator::operator*() { + if (m_suggestionItem) { + return *m_suggestionItem; + } + +#if defined(LIBZIM_WITH_XAPIAN) + if (mp_internal) { + m_suggestionItem.reset(new SuggestionItem(getIndexTitle(), + getIndexPath(), getIndexSnippet())); + } else +#endif // LIBZIM_WITH_XAPIAN + + if (mp_rangeIterator) { + m_suggestionItem.reset(new SuggestionItem((*mp_rangeIterator)->getTitle(), + (*mp_rangeIterator)->getPath())); + } + + if (!m_suggestionItem){ + throw std::runtime_error("Cannot dereference iterator"); + } + + return *m_suggestionItem.get(); +} + +const SuggestionItem* SuggestionIterator::operator->() { + operator*(); + return m_suggestionItem.get(); +} + +} // namespace zim diff --git a/src/template.cpp b/src/template.cpp new file mode 100644 index 0000000..75e4bb8 --- /dev/null +++ b/src/template.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "template.h" + +namespace zim +{ + void TemplateParser::state_data(char ch) + { + data += ch; + + if (ch == '<') + { + state = &TemplateParser::state_lt; + save = data.size() - 1; + } + } + + void TemplateParser::state_lt(char ch) + { + data += ch; + + if (ch == '%') + state = &TemplateParser::state_token0; + else + state = &TemplateParser::state_data; + } + + void TemplateParser::state_token0(char ch) + { + data += ch; + + if (ch == '/') + state = &TemplateParser::state_link0; + else + { + token = data.size() - 1; + state = &TemplateParser::state_token; + } + } + + void TemplateParser::state_token(char ch) + { + data += ch; + + if (ch == '%') + state = &TemplateParser::state_token_end; + } + + void TemplateParser::state_token_end(char ch) + { + if (ch == '>') + { + if (event) + { + event->onData(data.substr(0, save)); + event->onToken(data.substr(token, data.size() - token - 1)); + data.clear(); + } + + state = &TemplateParser::state_data; + } + else + { + data += ch; + state = &TemplateParser::state_data; + } + } + + void TemplateParser::state_link0(char ch) + { + data += ch; + + ns = ch; + state = &TemplateParser::state_link; + } + + void TemplateParser::state_link(char ch) + { + data += ch; + + if (ch == '/') + { + token = data.size(); + state = &TemplateParser::state_title; + } + else + state = &TemplateParser::state_data; + } + + void TemplateParser::state_title(char ch) + { + data += ch; + + if (ch == '%') + { + token_e = data.size() - 1; + state = &TemplateParser::state_title_end; + } + } + + void TemplateParser::state_title_end(char ch) + { + data += ch; + + if (ch == '>') + { + if (event) + { + event->onData(data.substr(0, save)); + event->onLink(ns, data.substr(token, token_e - token)); + } + + data.clear(); + state = 
&TemplateParser::state_data; + } + } + + void TemplateParser::flush() + { + if (event) + event->onData(data); + data.clear(); + state = &TemplateParser::state_data; + } +} diff --git a/src/template.h b/src/template.h new file mode 100644 index 0000000..116be10 --- /dev/null +++ b/src/template.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_TEMPLATE_H +#define ZIM_TEMPLATE_H + +#include + +namespace zim +{ + class TemplateParser + { + public: + class Event + { + public: + virtual void onData(const std::string& data) = 0; + virtual void onToken(const std::string& token) = 0; + virtual void onLink(char ns, const std::string& url) = 0; + virtual ~Event() = default; + }; + + private: + Event* event; + + std::string data; + std::string::size_type save; + std::string::size_type token; + std::string::size_type token_e; + char ns; + typedef void (TemplateParser::*state_type)(char); + + state_type state; + + void state_data(char ch); + void state_lt(char ch); + void state_token0(char ch); + void state_token(char ch); + void state_token_end(char ch); + void state_link0(char ch); + void state_link(char ch); + void state_title(char ch); + void state_title_end(char ch); + + public: + 
explicit TemplateParser(Event* ev) + : event(ev), + state(&TemplateParser::state_data) + { } + + void parse(char ch) + { + (this->*state)(ch); + } + + void parse(const std::string& s) + { + for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch) + parse(*ch); + } + + void flush(); + }; +} + +#endif // ZIM_TEMPLATE_H diff --git a/src/tools.cpp b/src/tools.cpp new file mode 100644 index 0000000..6142b2e --- /dev/null +++ b/src/tools.cpp @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2016-2021 Matthieu Gautier + * Copyright (C) 2021 Maneeshs P M + * Copyright (C) 2013-2016 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "tools.h" +#include "fs.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +# include +# include +# include +# include +# define SEPARATOR "\\" +#else +# include +# define SEPARATOR "/" +#endif + +#ifdef __MINGW32__ +# include +#else +# include +# include +#endif + +bool zim::isCompressibleMimetype(const std::string& mimetype) +{ + return mimetype.find("text") == 0 + || mimetype.find("+xml") != std::string::npos + || mimetype.find("+json") != std::string::npos + || mimetype == "application/javascript" + || mimetype == "application/json"; +} + +uint32_t zim::countWords(const std::string& text) +{ + unsigned int numWords = 0; + unsigned int length = text.size(); + unsigned int i = 0; + + // Find first word + while ( i < length && std::isspace(text[i]) ) i++; + + while ( i < length ) { + // Find end of word + while ( i < length && !std::isspace(text[i]) ) i++; + numWords++; + // Find start of next word + while ( i < length && std::isspace(text[i]) ) i++; + } + return numWords; +} + + +void zim::microsleep(int microseconds) { +#ifdef __MINGW32__ + struct timespec wait = {0, 0}; + wait.tv_sec = microseconds / 1000000; + wait.tv_nsec = (microseconds - wait.tv_sec*10000) * 1000; + nanosleep(&wait, nullptr); +#else + std::this_thread::sleep_for(std::chrono::microseconds(microseconds)); +#endif +} + + +std::tuple zim::parseLongPath(const std::string& longPath) +{ + /* Index of the namespace char; discard '/' from absolute paths */ + const unsigned int i = (longPath[0] == '/') ? 
1 : 0; + if (i + 1 > longPath.size() || longPath[i] == '/' || (i + 1 < longPath.size() && longPath[i+1] != '/')) + throw std::runtime_error("Cannot parse path"); + + auto ns = longPath[i]; + auto shortPath = longPath.substr(std::min(i+2, (unsigned int)longPath.size())); + + return std::make_tuple(ns, shortPath); +} + +unsigned int zim::parseIllustrationPathToSize(const std::string& s) +{ + int nw(0), nh(0), nEnd(0); + long int w(-1), h(-1); + if ( sscanf(s.c_str(), "Illustration_%n%ldx%n%ld@1%n)", &nw, &w, &nh, &h, &nEnd) == 2 + && (size_t)nEnd == s.size() && !isspace(s[nw]) && !isspace(s[nh]) && w == h && w >= 0) { + return (unsigned int)w; + } + throw std::runtime_error(""); +} + +uint32_t zim::randomNumber(uint32_t max) +{ + static std::default_random_engine random( + std::chrono::system_clock::now().time_since_epoch().count()); + static std::mutex mutex; + + std::lock_guard l(mutex); + return ((double)random() / random.max()) * max; +} + +/* Split string in a token array */ +std::vector zim::split(const std::string & str, + const std::string & delims) +{ + std::string::size_type lastPos = str.find_first_not_of(delims, 0); + std::string::size_type pos = str.find_first_of(delims, lastPos); + std::vector tokens; + + while (std::string::npos != pos || std::string::npos != lastPos) + { + tokens.push_back(str.substr(lastPos, pos - lastPos)); + lastPos = str.find_first_not_of(delims, pos); + pos = str.find_first_of(delims, lastPos); + } + + return tokens; +} + +/* Parses a "name:slot;name:slot" string into a name -> slot map. Entries that do not contain both a name and a slot are skipped. */ +std::map zim::read_valuesmap(const std::string &s) { + std::map result; + std::vector elems = split(s, ";"); + for(std::vector::iterator elem = elems.begin(); + elem != elems.end(); + elem++) + { + std::vector tmp_elems = split(*elem, ":"); + /* split() yields fewer than two tokens for entries such as "title" or ":"; indexing them would read out of bounds. */ + if (tmp_elems.size() < 2) { + continue; + } + result.insert( std::pair(tmp_elems[0], atoi(tmp_elems[1].c_str())) ); + } + return result; +} + +// Xapian based tools +#if defined(ENABLE_XAPIAN) + +#include "xapian.h" + +#include +#include +#include +std::string zim::removeAccents(const std::string& text) +{ +
ucnv_setDefaultName("UTF-8"); + static UErrorCode status = U_ZERO_ERROR; + static std::unique_ptr removeAccentsTrans(icu::Transliterator::createInstance( + "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status)); + icu::UnicodeString ustring(text.c_str()); + removeAccentsTrans->transliterate(ustring); + std::string unaccentedText; + ustring.toUTF8String(unaccentedText); + return unaccentedText; +} + +bool zim::getDbFromAccessInfo(zim::Item::DirectAccessInfo accessInfo, Xapian::Database& database) { + zim::DEFAULTFS::FD databasefd; + try { + databasefd = zim::DEFAULTFS::openFile(accessInfo.first); + } catch (...) { + std::cerr << "Impossible to open " << accessInfo.first << std::endl; + std::cerr << strerror(errno) << std::endl; + return false; + } + if (!databasefd.seek(zim::offset_t(accessInfo.second))) { + std::cerr << "Something went wrong seeking databasedb " + << accessInfo.first << std::endl; + std::cerr << "dbOffest = " << accessInfo.second << std::endl; + return false; + } + + try { + database = Xapian::Database(databasefd.release()); + } catch( Xapian::DatabaseError& e) { + std::cerr << "Something went wrong opening xapian database for zimfile " + << accessInfo.first << std::endl; + std::cerr << "dbOffest = " << accessInfo.second << std::endl; + std::cerr << "error = " << e.get_msg() << std::endl; + return false; + } + + return true; +} + +void setICUDataDirectory(const std::string& path) +{ + u_setDataDirectory(path.c_str()); +} +#endif diff --git a/src/tools.h b/src/tools.h new file mode 100644 index 0000000..a42d4fd --- /dev/null +++ b/src/tools.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2016-2020 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2013-2016 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_TOOLS_H +#define OPENZIM_LIBZIM_TOOLS_H + +#include +#include +#include +#include +#include "config.h" + +#include + +#if defined(ENABLE_XAPIAN) +namespace Xapian { + class Database; +} +#endif // ENABLE_XAPIAN +namespace zim { + bool isCompressibleMimetype(const std::string& mimetype); + uint32_t countWords(const std::string& text); + void microsleep(int microseconds); + + std::tuple parseLongPath(const std::string& longPath); + + // Parse an illustration path ("Illustration_<size>x<size>@1") to a size. 
+ unsigned int parseIllustrationPathToSize(const std::string& s); + + /** Return a random number from range [0, max] + * + * This function is threadsafe + **/ + uint32_t randomNumber(uint32_t max); + + std::vector split(const std::string & str, + const std::string & delims=" *-"); + + std::map read_valuesmap(const std::string& s); + +// Xapian based tools +#if defined(ENABLE_XAPIAN) + std::string removeAccents(const std::string& text); + bool getDbFromAccessInfo(Item::DirectAccessInfo accessInfo, Xapian::Database& database); +#endif +} + +#endif // OPENZIM_LIBZIM_TOOLS_H diff --git a/src/uuid.cpp b/src/uuid.cpp new file mode 100644 index 0000000..950db38 --- /dev/null +++ b/src/uuid.cpp @@ -0,0 +1,112 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2018-2020 Matthieu Gautier + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include +#include +#include // necessary to have the new types +#include "log.h" +#include "md5.h" + +#ifdef _WIN32 + +# include +# include +/* Minimal Windows stand-in for gettimeofday(), based on the millisecond counter returned by timeGetTime(). tzp is ignored. */ +int gettimeofday(struct timeval* tp, void* tzp) { + DWORD t; + t = timeGetTime(); + tp->tv_sec = t / 1000; + /* t is in milliseconds: scale the sub-second remainder to microseconds. */ + tp->tv_usec = (t % 1000) * 1000; + return 0; +} + +#define getpid GetCurrentProcessId + +#else +# include +#endif + +log_define("zim.uuid") + +namespace zim +{ + namespace + { + char hex[] = "0123456789abcdef"; + inline char hi(char v) + { return hex[(v >> 4) & 0xf]; } + + inline char lo(char v) + { return hex[v & 0xf]; } + } + + /* Builds a Uuid from the MD5 digest of value, or of the current clock and time of day when value is empty. */ + Uuid Uuid::generate(std::string value) + { + Uuid ret; + struct zim_MD5_CTX md5ctx; + zim_MD5Init(&md5ctx); + + if ( value.empty() ) { + struct timeval tv; + gettimeofday(&tv, 0); + + clock_t c = clock(); + + zim_MD5Update(&md5ctx, reinterpret_cast(&c), sizeof(clock_t)); + zim_MD5Update(&md5ctx, reinterpret_cast(&tv), sizeof(struct timeval)); + } else { + zim_MD5Update(&md5ctx, reinterpret_cast(value.data()), value.size()); + } + zim_MD5Final(reinterpret_cast(&ret.data[0]), &md5ctx); + + /* ret.data holds raw digest bytes, not text: log the formatted uuid instead of streaming the bytes as a C string. */ + log_debug("generated uuid: " << ret); + + return ret; + } + + Uuid::operator std::string() const + { + std::ostringstream out; + zim::operator<<(out, *this); + return out.str(); + } + + std::ostream& operator<< (std::ostream& out, const Uuid& uuid) + { + for (unsigned n = 0; n < 4; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 4; n < 6; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 6; n < 8; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 8; n < 10; ++n) + out << hi(uuid.data[n]) << lo(uuid.data[n]); + out << '-'; + for (unsigned n = 10; n < 16; ++n) + out 
<< hi(uuid.data[n]) << lo(uuid.data[n]); + return out; + } + +} diff --git a/src/version.cpp b/src/version.cpp new file mode 100644 index 0000000..5f6b077 --- /dev/null +++ b/src/version.cpp @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2021 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include + +#include +#include +#include +#include +#include + +#if defined(ENABLE_XAPIAN) +#include +#include +#endif + +namespace zim +{ + LibVersions getVersions() { + LibVersions versions = { + { "libzim", LIBZIM_VERSION }, + { "libzstd", ZSTD_VERSION_STRING }, + { "liblzma", LZMA_VERSION_STRING } + }; + +#if defined(ENABLE_XAPIAN) + // Libxapian is not a mandatory dependence + versions.push_back({ "libxapian", XAPIAN_VERSION }); + + // U_ICU_VERSION does not include the patch level if 0 + std::ostringstream libicu_version; + libicu_version << U_ICU_VERSION_MAJOR_NUM << "." << U_ICU_VERSION_MINOR_NUM << "." << U_ICU_VERSION_PATCHLEVEL_NUM; + versions.push_back({ "libicu", libicu_version.str() }); +#endif + + return versions; + } + +void printVersions(std::ostream& out) { + LibVersions versions = getVersions(); + for (const auto& iter : versions) { + out << (iter != versions.front() ? 
"+ " : "") << + iter.first << " " << iter.second << std::endl; + } +} + +} //namespace zim diff --git a/src/writer/_dirent.h b/src/writer/_dirent.h new file mode 100644 index 0000000..7028013 --- /dev/null +++ b/src/writer/_dirent.h @@ -0,0 +1,247 @@ +/* + * Copyright (C) 2018-2021 Matthieu Gautier + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DIRENT_H +#define ZIM_WRITER_DIRENT_H + +#include "cluster.h" +#include "tinyString.h" + +#include "debug.h" + +namespace zim +{ + namespace writer { + class Dirent; + + // Be sure that enum value are sorted by "alphabetical" order + enum class NS: uint8_t { + C = 0, + M = 1, + W = 2, + X = 3 + }; + + char NsAsChar(NS ns); + + class DirentInfo { + public: // structures + struct Direct { + Direct() : + cluster(nullptr), + blobNumber(0) + {}; + Cluster* cluster; + blob_index_t blobNumber; + } PACKED; + + struct Redirect { + Redirect(NS ns, const std::string& target) : + targetPath(target), + ns(ns) + {}; + Redirect(Redirect&& r) = default; + ~Redirect() {}; + TinyString targetPath; + NS ns; + } PACKED; + + struct Resolved { + Resolved(const Dirent* target) : + targetDirent(target) + {}; + const Dirent* targetDirent; + } PACKED; + + public: // functions 
+ ~DirentInfo() { + switch(tag) { + case DIRECT: + direct.~Direct(); + break; + case REDIRECT: + redirect.~Redirect(); + break; + case RESOLVED: + resolved.~Resolved(); + break; + } + }; + DirentInfo(Direct&& d): + direct(std::move(d)), + tag(DirentInfo::DIRECT) + {} + DirentInfo(Redirect&& r): + redirect(std::move(r)), + tag(DirentInfo::REDIRECT) + {} + DirentInfo(Resolved&& r): + resolved(std::move(r)), + tag(DirentInfo::RESOLVED) + {} + DirentInfo::Direct& getDirect() { + ASSERT(tag, ==, DIRECT); + return direct; + } + DirentInfo::Redirect& getRedirect() { + ASSERT(tag, ==, REDIRECT); + return redirect; + } + DirentInfo::Resolved& getResolved() { + ASSERT(tag, ==, RESOLVED); + return resolved; + } + const DirentInfo::Direct& getDirect() const { + ASSERT(tag, ==, DIRECT); + return direct; + } + const DirentInfo::Redirect& getRedirect() const { + ASSERT(tag, ==, REDIRECT); + return redirect; + } + const DirentInfo::Resolved& getResolved() const { + ASSERT(tag, ==, RESOLVED); + return resolved; + } + + private: // members + union { + Direct direct; + Redirect redirect; + Resolved resolved; + } PACKED; + + public: // members + enum : char {DIRECT, REDIRECT, RESOLVED} tag; + } PACKED; + + class Dirent + { + static const uint16_t redirectMimeType = 0xffff; + static const uint32_t version = 0; + + PathTitleTinyString pathTitle; + uint16_t mimeType; + entry_index_t idx = entry_index_t(0); + DirentInfo info; + offset_t offset; + uint8_t _ns : 2; + bool removed : 1; + bool frontArticle : 1; + + public: + // Creator for a "classic" dirent + Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype); + + // Creator for a "redirection" dirent + Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath); + + // Creator for "temporary" dirent, used to search for dirent in container. + // We use them in url ordered container so we only need to set the namespace and the path. 
+ // Other value are irrelevant. + Dirent(NS ns, const std::string& path) + : Dirent(ns, path, "", 0) + { } + + NS getNamespace() const { return static_cast(_ns); } + std::string getTitle() const { return pathTitle.getTitle(false); } + std::string getRealTitle() const { return pathTitle.getTitle(true); } + std::string getPath() const { return pathTitle.getPath(); } + + uint32_t getVersion() const { return version; } + + NS getRedirectNs() const; + std::string getRedirectPath() const; + void setRedirect(const Dirent* target) { + ASSERT(info.tag, ==, DirentInfo::REDIRECT); + info.~DirentInfo(); + new(&info) DirentInfo(DirentInfo::Resolved(target)); + } + entry_index_t getRedirectIndex() const { + return info.getResolved().targetDirent->getIdx(); + } + + void setIdx(entry_index_t idx_) { idx = idx_; } + entry_index_t getIdx() const { return idx; } + + + void setCluster(zim::writer::Cluster* _cluster) + { + auto& direct = info.getDirect(); + direct.cluster = _cluster; + direct.blobNumber = _cluster->count(); + } + + zim::writer::Cluster* getCluster() + { + return info.getDirect().cluster; + } + + cluster_index_t getClusterNumber() const { + auto& direct = info.getDirect(); + return direct.cluster ? direct.cluster->getClusterIndex() : cluster_index_t(0); + } + blob_index_t getBlobNumber() const { + return info.getDirect().blobNumber; + } + + bool isRedirect() const { return mimeType == redirectMimeType; } + bool isItem() const { return !isRedirect(); } + uint16_t getMimeType() const { return mimeType; } + void setMimeType(uint16_t m) { + ASSERT(info.tag, ==, DirentInfo::DIRECT); + mimeType = m; + } + size_t getDirentSize() const + { + return (isRedirect() ? 
12 : 16) + pathTitle.size() + 1; + } + + offset_t getOffset() const { return offset; } + void setOffset(offset_t o) { offset = o; } + + bool isRemoved() const { return removed; } + void markRemoved() { removed = true; } + + bool isFrontArticle() const { return frontArticle; } + void setFrontArticle() { frontArticle = true; } + + void write(int out_fd) const; + + friend bool compareUrl(const Dirent* d1, const Dirent* d2); + friend inline bool compareTitle(const Dirent* d1, const Dirent* d2); + } PACKED; + + + inline bool compareUrl(const Dirent* d1, const Dirent* d2) + { + return d1->getNamespace() < d2->getNamespace() + || (d1->getNamespace() == d2->getNamespace() && d1->getPath() < d2->getPath()); + } + inline bool compareTitle(const Dirent* d1, const Dirent* d2) + { + return d1->getNamespace() < d2->getNamespace() + || (d1->getNamespace() == d2->getNamespace() && d1->getTitle() < d2->getTitle()); + } + } +} + +#endif // ZIM_WRITER_DIRENT_H + diff --git a/src/writer/cluster.cpp b/src/writer/cluster.cpp new file mode 100644 index 0000000..528f8e5 --- /dev/null +++ b/src/writer/cluster.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * Copyright (C) 2021 Veloman Yunkan + * Copyright (C) 2020 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "cluster.h" +#include "../log.h" +#include "../endian_tools.h" +#include "../debug.h" +#include "../compression.h" + +#include + +#include +#include + +#include +#include + +#ifdef _WIN32 +# include +#else +# include +# define _write(fd, addr, size) ::write((fd), (addr), (size)) +#endif + +const zim::size_type MAX_WRITE_SIZE(4UL*1024*1024*1024-1); + +namespace zim { +namespace writer { + +Cluster::Cluster(Compression compression) + : compression(compression), + isExtended(false), + _size(0) +{ + blobOffsets.push_back(offset_t(0)); +} + +Cluster::~Cluster() { + if (compressed_data.data()) { + delete[] compressed_data.data(); + } +} + +void Cluster::clear_data() { + clear_raw_data(); + clear_compressed_data(); +} + +void Cluster::clear_raw_data() { + Offsets().swap(blobOffsets); + ClusterProviders().swap(m_providers); +} + +void Cluster::clear_compressed_data() { + if (compressed_data.data()) { + delete[] compressed_data.data(); + compressed_data = Blob(); + } +} + +void Cluster::close() { + if (getCompression() != Compression::None) { + // We must compress the content in a buffer. 
+ compress(); + clear_raw_data(); + } + closed = true; +} + +bool Cluster::isClosed() const{ + return closed; +} + +zsize_t Cluster::size() const +{ + if (isClosed()) { + throw std::runtime_error("oups"); + } + if (isExtended) { + return zsize_t(blobOffsets.size() * sizeof(uint64_t)) + _size; + } else { + return zsize_t(blobOffsets.size() * sizeof(uint32_t)) + _size; + } +} + +template +void Cluster::write_offsets(writer_t writer) const +{ + size_type delta = blobOffsets.size() * sizeof(OFFSET_TYPE); + char out_buf[sizeof(OFFSET_TYPE)]; + for (auto offset : blobOffsets) + { + offset.v += delta; + toLittleEndian(static_cast(offset.v), out_buf); + writer(Blob(out_buf, sizeof(OFFSET_TYPE))); + } +} + +void Cluster::write_content(writer_t writer) const +{ + if (isExtended) { + write_offsets(writer); + } else { + write_offsets(writer); + } + write_data(writer); +} + +void Cluster::compress() +{ + auto comp = getCompression(); + switch(comp) { + case Compression::Zstd: + { + _compress(); + break; + } + + default: + throw std::runtime_error("We cannot compress an uncompressed cluster"); + }; +} + +template +void Cluster::_compress() +{ + Compressor runner; + bool first = true; + auto writer = [&](const Blob& data) -> void { + if (first) { + runner.init((char*)data.data()); + first = false; + } + runner.feed(data.data(), data.size()); + }; + write_content(writer); + zsize_t size; + auto comp = runner.get_data(&size); + compressed_data = Blob(comp.release(), size.v); +} + +void Cluster::write(int out_fd) const +{ + // write clusterInfo + char clusterInfo = 0; + if (isExtended) { + clusterInfo = 0x10; + } + clusterInfo += static_cast(getCompression()); + if (_write(out_fd, &clusterInfo, 1) == -1) { + throw std::runtime_error("Error writing"); + } + + // Open a comprestion stream if needed + switch(getCompression()) + { + case Compression::None: + { + auto writer = [=](const Blob& data) -> void { + // Ideally we would simply have to do : + // ::write(tmp_fd, data.c_str(), 
data.size()); + // However, the data can be pretty big (> 4Gb), especially with test, + // And ::write fails to write data > 4Gb. So we have to chunck the write. + size_type to_write = data.size(); + const char* src = data.data(); + while (to_write) { + size_type chunk_size = std::min(MAX_WRITE_SIZE, to_write); + auto ret = _write(out_fd, src, chunk_size); + src += ret; + to_write -= ret; + } + }; + write_content(writer); + break; + } + + case Compression::Zstd: + { + log_debug("compress data"); + if (_write(out_fd, compressed_data.data(), compressed_data.size()) == -1) { + throw std::runtime_error("Error writing"); + } + break; + } + + default: + std::ostringstream msg; + msg << "invalid compression flag " << static_cast(getCompression()); + log_error(msg.str()); + throw std::runtime_error(msg.str()); + } +} + + +void Cluster::addContent(std::unique_ptr provider) +{ + auto size = provider->getSize(); + _size += size; + blobOffsets.push_back(offset_t(_size.v)); + m_count++; + isExtended |= (_size.v>UINT32_MAX); + if (size == 0) + return; + + m_providers.push_back(std::move(provider)); +} + +void Cluster::addContent(const std::string& data) +{ + auto contentProvider = std::unique_ptr(new StringProvider(data)); + addContent(std::move(contentProvider)); +} + +void Cluster::write_data(writer_t writer) const +{ + for (auto& provider: m_providers) + { + ASSERT(provider->getSize(), !=, 0U); + zim::size_type size = 0; + while(true) { + auto blob = provider->feed(); + if(blob.size() == 0) { + break; + } + size += blob.size(); + writer(blob); + } + ASSERT(size, ==, provider->getSize()); + } +} + +} // writer +} // zim diff --git a/src/writer/cluster.h b/src/writer/cluster.h new file mode 100644 index 0000000..ce6aa3a --- /dev/null +++ b/src/writer/cluster.h @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2017-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by 
the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CLUSTER_H_ +#define ZIM_WRITER_CLUSTER_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include "../zim_types.h" +#include "../debug.h" + +namespace zim { + +namespace writer { + +using writer_t = std::function; +class ContentProvider; + +class Cluster { + typedef std::vector Offsets; + typedef std::vector> ClusterProviders; + + + public: + Cluster(Compression compression); + virtual ~Cluster(); + + void setCompression(Compression c) { compression = c; } + Compression getCompression() const { return compression; } + + void addContent(std::unique_ptr provider); + void addContent(const std::string& data); + + blob_index_t count() const { return blob_index_t(m_count); } + zsize_t size() const; + offset_t getOffset() const { return offset; } + void setOffset(offset_t o) { offset = o; } + bool is_extended() const { return isExtended; } + void clear_data(); + void close(); + bool isClosed() const; + + void setClusterIndex(cluster_index_t idx) { index = idx; } + cluster_index_t getClusterIndex() const { return index; } + + zsize_t getBlobSize(blob_index_t n) const + { return zsize_t(blobOffsets[blob_index_type(n)+1].v - blobOffsets[blob_index_type(n)].v); } + + offset_t getBlobOffset(blob_index_t n) const { return blobOffsets[n.v]; } + offset_t getDataOffset() const { + ASSERT(bool(closed), ==, true); + return 
offset_t(1) + offset_t((count().v + 1) * (isExtended?sizeof(uint64_t):sizeof(uint32_t))); + } + + void write(int out_fd) const; + + protected: + Compression compression; + cluster_index_t index; + bool isExtended; + Offsets blobOffsets; + offset_t offset; + zsize_t _size; + ClusterProviders m_providers; + mutable Blob compressed_data; + std::string tmp_filename; + std::atomic closed { false }; + blob_index_type m_count { 0 }; + + private: + void write_content(writer_t writer) const; + template + void write_offsets(writer_t writer) const; + void write_data(writer_t writer) const; + void compress(); + template + void _compress(); + void clear_raw_data(); + void clear_compressed_data(); +}; + +}; + +}; + + +#endif //ZIM_WRITER_CLUSTER_H_ diff --git a/src/writer/clusterWorker.cpp b/src/writer/clusterWorker.cpp new file mode 100644 index 0000000..f820bcd --- /dev/null +++ b/src/writer/clusterWorker.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "clusterWorker.h" + +#include "cluster.h" + +std::atomic zim::writer::ClusterTask::waiting_task(0); + +namespace zim +{ + namespace writer + { + + void ClusterTask::run(CreatorData* data) { + cluster->close(); + }; + + } +} diff --git a/src/writer/clusterWorker.h b/src/writer/clusterWorker.h new file mode 100644 index 0000000..66e0dcc --- /dev/null +++ b/src/writer/clusterWorker.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#ifndef OPENZIM_LIBZIM_CLUSTER_WORKER_H +#define OPENZIM_LIBZIM_CLUSTER_WORKER_H + +#include +#include "workers.h" + +namespace zim { +namespace writer { + +class Cluster; + +class ClusterTask : public Task { + public: + ClusterTask(const ClusterTask&) = delete; + ClusterTask& operator=(const ClusterTask&) = delete; + explicit ClusterTask(Cluster* cluster) : + cluster(cluster) + { + ++waiting_task; + }; + virtual ~ClusterTask() + { + --waiting_task; + } + + virtual void run(CreatorData* data); + static std::atomic waiting_task; + + private: + Cluster* cluster; +}; + +} +} + +#endif // OPENZIM_LIBZIM_QUEUE_H diff --git a/src/writer/contentProvider.cpp b/src/writer/contentProvider.cpp new file mode 100644 index 0000000..0896d90 --- /dev/null +++ b/src/writer/contentProvider.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +#include "../fs.h" + +const zim::size_type BUFFER_SIZE(1024*1024); + +namespace zim +{ + namespace writer + { + Blob StringProvider::feed() + { + if (feeded) { + return Blob(nullptr, 0); + } + feeded = true; + return Blob(content.data(), content.size()); + } + + Blob SharedStringProvider::feed() + { + if (feeded) { + return Blob(nullptr, 0); + } + feeded = true; + return Blob(content->data(), content->size()); + } + + FileProvider::FileProvider(const std::string& filepath) + : filepath(filepath), + buffer(new char[BUFFER_SIZE]), + fd(new DEFAULTFS::FD(DEFAULTFS::openFile(filepath))), + offset(0) + { + size = fd->getSize().v; + } + + FileProvider::~FileProvider() = default; + + Blob FileProvider::feed() + { + auto sizeToRead = std::min(BUFFER_SIZE, size-offset); + if (!sizeToRead) { + return Blob(nullptr, 0); + } + + if(fd->readAt(buffer.get(), zim::zsize_t(sizeToRead), zim::offset_t(offset)).v == -1UL) { + throw std::runtime_error("Error reading file " + filepath); + } + offset += sizeToRead; + return Blob(buffer.get(), sizeToRead); + } + } +} diff --git a/src/writer/counterHandler.cpp b/src/writer/counterHandler.cpp new file mode 100644 index 0000000..b29df05 --- /dev/null +++ b/src/writer/counterHandler.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "counterHandler.h" +#include "creatordata.h" + +#include +#include + +using namespace zim::writer; + +CounterHandler::CounterHandler(CreatorData* data) + : mp_creatorData(data) +{} + +CounterHandler::~CounterHandler() = default; + +void CounterHandler::start() { +} + +void CounterHandler::stop() { +} + +DirentHandler::Dirents CounterHandler::createDirents() const { + Dirents ret; + ret.push_back(mp_creatorData->createDirent(NS::M, "Counter", "text/plain", "")); + return ret; +} + +DirentHandler::ContentProviders CounterHandler::getContentProviders() const { + ContentProviders ret; + std::stringstream ss; + bool first = true; + for(auto pair: m_mimetypeCounter) { + if (! first) { + ss << ";"; + } + ss << pair.first << "=" << pair.second; + first = false; + } + ret.push_back(std::unique_ptr(new StringProvider(ss.str()))); + return ret; +} + +void CounterHandler::handle(Dirent* dirent, const Hints& hints) +{ +} + +void CounterHandler::handle(Dirent* dirent, std::shared_ptr item) +{ + if (dirent->getNamespace() != NS::C) { + return; + } + auto mimetype = item->getMimeType(); + if (mimetype.empty()) { + return; + } + m_mimetypeCounter[mimetype] += 1; +} diff --git a/src/writer/counterHandler.h b/src/writer/counterHandler.h new file mode 100644 index 0000000..17eb48c --- /dev/null +++ b/src/writer/counterHandler.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_COUNTER_HANDLER_H +#define OPENZIM_LIBZIM_COUNTER_HANDLER_H + +#include "handler.h" + +#include + +namespace zim { +namespace writer { + + +class CounterHandler : public DirentHandler { + public: + typedef std::map Counter; + + explicit CounterHandler(CreatorData* data); + virtual ~CounterHandler(); + + void start() override; + void stop() override; + bool isCompressible() override { return true; } + ContentProviders getContentProviders() const override; + void handle(Dirent* dirent, std::shared_ptr item) override; + void handle(Dirent* dirent, const Hints& hints) override; + + private: + Dirents createDirents() const override; + CreatorData* mp_creatorData; + Counter m_mimetypeCounter; +}; + +} +} + +#endif // OPENZIM_LIBZIM_COUNTER_HANDLER_H diff --git a/src/writer/creator.cpp b/src/writer/creator.cpp new file mode 100644 index 0000000..6eed73b --- /dev/null +++ b/src/writer/creator.cpp @@ -0,0 +1,681 @@ +/* + * Copyright (C) 2019-2021 Matthieu Gautier + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2021 Veloman Yunkan + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include + +#include "config.h" + +#include "creatordata.h" +#include "cluster.h" +#include "debug.h" +#include "workers.h" +#include "clusterWorker.h" +#include +#include +#include "../endian_tools.h" +#include +#include +#include "../md5.h" +#include "../constants.h" +#include "counterHandler.h" + +#if defined(ENABLE_XAPIAN) +# include "xapianHandler.h" +#endif + +#ifdef _WIN32 +# include +# include +#else +# include +# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ +{throw std::runtime_error("Error writing");} +#endif + +#include +#include +#include +#include +#include +#include +#include +#include "log.h" +#include "../fs.h" +#include "../tools.h" + +log_define("zim.writer.creator") + +#define INFO(e) \ + do { \ + log_info(e); \ + std::cout << e << std::endl; \ + } while(false) + +#define TINFO(e) \ + if (m_verbose) { \ + double seconds = difftime(time(NULL), data->start_time); \ + std::cout << "T:" << (int)(seconds) \ + << "; " << e << std::endl; \ + } + +#define TPROGRESS() \ + if (m_verbose ) { \ + double seconds = difftime(time(NULL),data->start_time); \ + std::cout << "T:" << (int)seconds \ + << "; A:" << data->dirents.size() \ + << "; RA:" << data->nbRedirectItems \ + << "; CA:" << data->nbCompItems \ + << "; UA:" << data->nbUnCompItems \ + << "; C:" << data->nbClusters \ + << "; CC:" << data->nbCompClusters \ + << "; UC:" << data->nbUnCompClusters \ + << "; WC:" << data->taskList.size() \ + << std::endl; \ + 
} + + +#define CLUSTER_BASE_OFFSET 2048 + +namespace zim +{ + namespace writer + { + Creator::Creator() + : m_clusterSize(DEFAULT_CLUSTER_SIZE) + {} + Creator::~Creator() = default; + + Creator& Creator::configVerbose(bool verbose) + { + m_verbose = verbose; + return *this; + } + + Creator& Creator::configCompression(Compression compression) + { + m_compression = compression; + return *this; + } + + Creator& Creator::configClusterSize(zim::size_type targetSize) + { + m_clusterSize = targetSize; + return *this; + } + + Creator& Creator::configIndexing(bool indexing, const std::string& language) + { + m_withIndex = indexing; + m_indexingLanguage = language; + return *this; + } + + Creator& Creator::configNbWorkers(unsigned nbWorkers) + { + m_nbWorkers = nbWorkers; + return *this; + } + + void Creator::startZimCreation(const std::string& filepath) + { + data = std::unique_ptr( + new CreatorData(filepath, m_verbose, m_withIndex, m_indexingLanguage, m_compression, m_clusterSize) + ); + + for(unsigned i=0; idata.get()); + data->workerThreads.push_back(std::move(thread)); + } + + data->writerThread = std::thread(clusterWriter, this->data.get()); + } + + void Creator::addItem(std::shared_ptr item) + { + bool compressContent = item->getAmendedHints()[COMPRESS]; + auto dirent = data->createItemDirent(item.get()); + data->addItemData(dirent, item->getContentProvider(), compressContent); + data->handle(dirent, item); + + if (data->dirents.size()%1000 == 0) { + TPROGRESS(); + } + } + + void Creator::addMetadata(const std::string& name, const std::string& content, const std::string& mimetype) + { + auto provider = std::unique_ptr(new StringProvider(content)); + addMetadata(name, std::move(provider), mimetype); + } + + void Creator::addMetadata(const std::string& name, std::unique_ptr provider, const std::string& mimetype) + { + auto compressContent = isCompressibleMimetype(mimetype); + auto dirent = data->createDirent(NS::M, name, mimetype, ""); + data->addItemData(dirent, 
std::move(provider), compressContent); + data->handle(dirent); + } + + void Creator::addIllustration(unsigned int size, const std::string& content) + { + auto provider = std::unique_ptr(new StringProvider(content)); + addIllustration(size, std::move(provider)); + } + + void Creator::addIllustration(unsigned int size, std::unique_ptr provider) + { + std::stringstream ss; + ss << "Illustration_" << size << "x" << size << "@1"; + addMetadata(ss.str(), std::move(provider), "image/png"); + } + + void Creator::addRedirection(const std::string& path, const std::string& title, const std::string& targetPath, const Hints& hints) + { + auto dirent = data->createRedirectDirent(NS::C, path, title, NS::C, targetPath); + if (data->dirents.size()%1000 == 0){ + TPROGRESS(); + } + + data->handle(dirent, hints); + } + + void Creator::finishZimCreation() + { + // Create a redirection for the mainPage. + // We need to keep the created dirent to set the fileheader. + // Dirent doesn't have to be deleted. + if (!m_mainPath.empty()) { + data->mainPageDirent = data->createRedirectDirent(NS::W, "mainPage", "", NS::C, m_mainPath); + data->handle(data->mainPageDirent); + } + + TPROGRESS(); + + // mp_titleListingHandler is a special case, it have to handle all dirents (including itself) + for(auto& handler:data->m_direntHandlers) { + // This silently create all the needed dirents. 
+ for(auto dirent:handler->getDirents()) { + data->mp_titleListingHandler->handle(dirent, Hints()); + } + } + + // Now we have all the dirents (but not the data), we must correctly set/fix the dirents + // before we ask data to the handlers + TINFO("ResolveRedirectIndexes"); + data->resolveRedirectIndexes(); + + TINFO("Set entry indexes"); + data->setEntryIndexes(); + + TINFO("Resolve mimetype"); + data->resolveMimeTypes(); + + // We can now stop the direntHandlers, and get their content + bool titleListDirentSeen = false; + for(auto& handler:data->m_direntHandlers) { + handler->stop(); + const auto& dirents = handler->getDirents(); + if (dirents.empty()) { + continue; + } + auto providers = handler->getContentProviders(); + ASSERT(dirents.size(), ==, providers.size()); + auto provider_it = providers.begin(); + for(auto& dirent:dirents) { + // As we use a "handler level" isCompressible, all content of the same handler + // must have the same compression. + data->addItemData(dirent, std::move(*provider_it), handler->isCompressible()); + if (handler == data->mp_titleListingHandler && !titleListDirentSeen) { + // We have to get the offset of the titleList in the cluster before + // we close the cluster. Once the cluster is close, the offset information is dropped. + // This works only if titleListingHandler create the full (V0) titlelist in its first dirent. 
+ data->m_titleListBlobOffset = data->uncompCluster->getBlobOffset(dirent->getBlobNumber()); + titleListDirentSeen = true; + } + provider_it++; + } + } + + // All the data has been added, we can now close all clusters + if (data->compCluster->count()) + data->closeCluster(true); + + if (data->uncompCluster->count()) + data->closeCluster(false); + + TINFO("Waiting for workers"); + // wait all cluster compression has been done + unsigned int wait = 0; + do { + microsleep(wait); + wait += 10; + } while(ClusterTask::waiting_task.load() > 0); + + data->quitAllThreads(); + + // Delete all handler (they will clean there own data) + data->m_direntHandlers.clear(); + + TINFO(data->dirents.size() << " title index created"); + TINFO(data->clustersList.size() << " clusters created"); + + TINFO("write zimfile :"); + writeLastParts(); + ::close(data->out_fd); + data->out_fd = -1; + + TINFO("rename tmpfile to final one."); + DEFAULTFS::rename(data->tmpFileName, data->zimName); + data->tmpFileName.clear(); + + TINFO("finish"); + } + + void Creator::fillHeader(Fileheader* header) const + { + header->setMainPage( + data->mainPageDirent + ? entry_index_type(data->mainPageDirent->getIdx()) + : std::numeric_limits::max()); + header->setLayoutPage(std::numeric_limits::max()); + + header->setUuid( m_uuid ); + header->setArticleCount( data->dirents.size() ); + + header->setMimeListPos( Fileheader::size ); + + // We assume here that titleListingHandler create the V0 listing in its first dirent. 
+ auto cluster = data->mp_titleListingHandler->getDirents()[0]->getCluster(); + header->setTitleIdxPos( + offset_type(cluster->getOffset() + cluster->getDataOffset() + data->m_titleListBlobOffset)); + + header->setClusterCount( data->clustersList.size() ); + } + + void Creator::writeLastParts() const + { + Fileheader header; + fillHeader(&header); + + int out_fd = data->out_fd; + + lseek(out_fd, header.getMimeListPos(), SEEK_SET); + TINFO(" write mimetype list"); + for(auto& mimeType: data->mimeTypesList) + { + _write(out_fd, mimeType.c_str(), mimeType.size()+1); + } + + _write(out_fd, "", 1); + + ASSERT(lseek(out_fd, 0, SEEK_CUR), <, CLUSTER_BASE_OFFSET); + + TINFO(" write directory entries"); + lseek(out_fd, 0, SEEK_END); + for (Dirent* dirent: data->dirents) + { + dirent->setOffset(offset_t(lseek(out_fd, 0, SEEK_CUR))); + dirent->write(out_fd); + } + + TINFO(" write url prt list"); + header.setUrlPtrPos(lseek(out_fd, 0, SEEK_CUR)); + for (auto& dirent: data->dirents) + { + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(dirent->getOffset(), tmp_buff); + _write(out_fd, tmp_buff, sizeof(offset_type)); + } + + TINFO(" write cluster offset list"); + header.setClusterPtrPos(lseek(out_fd, 0, SEEK_CUR)); + for (auto cluster : data->clustersList) + { + char tmp_buff[sizeof(offset_type)]; + toLittleEndian(cluster->getOffset(), tmp_buff); + _write(out_fd, tmp_buff, sizeof(offset_type)); + } + + header.setChecksumPos(lseek(out_fd, 0, SEEK_CUR)); + + TINFO(" write header"); + lseek(out_fd, 0, SEEK_SET); + header.write(out_fd); + + TINFO(" write checksum"); + struct zim_MD5_CTX md5ctx; + unsigned char batch_read[1024+1]; + lseek(out_fd, 0, SEEK_SET); + zim_MD5Init(&md5ctx); + while (true) { + auto r = read(out_fd, batch_read, 1024); + if (r == -1) { + perror("Cannot read"); + throw std::runtime_error("oups"); + } + if (r == 0) + break; + batch_read[r] = 0; + zim_MD5Update(&md5ctx, batch_read, r); + } + unsigned char digest[16]; + zim_MD5Final(digest, &md5ctx); + 
_write(out_fd, reinterpret_cast(digest), 16); + } + + CreatorData::CreatorData(const std::string& fname, + bool verbose, + bool withIndex, + std::string language, + Compression c, + size_t clusterSize) + : mainPageDirent(nullptr), + compression(c), + zimName(fname), + tmpFileName(fname + ".tmp"), + clusterSize(clusterSize), + withIndex(withIndex), + indexingLanguage(language), + verbose(verbose), + nbRedirectItems(0), + nbCompItems(0), + nbUnCompItems(0), + nbClusters(0), + nbCompClusters(0), + nbUnCompClusters(0), + start_time(time(NULL)) + { +#ifdef _WIN32 + int flag = _O_RDWR | _O_CREAT | _O_TRUNC | _O_BINARY; + int mode = _S_IREAD | _S_IWRITE; +#else + int flag = O_RDWR | O_CREAT | O_TRUNC; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; +#endif + out_fd = open(tmpFileName.c_str(), flag, mode); + if (out_fd == -1){ + perror(nullptr); + std::ostringstream ss; + ss << "Cannot create file " << tmpFileName; + throw std::runtime_error(ss.str()); + } + if(lseek(out_fd, CLUSTER_BASE_OFFSET, SEEK_SET) != CLUSTER_BASE_OFFSET) { + close(out_fd); + perror(nullptr); + throw std::runtime_error("Impossible to seek in file"); + } + + // We keep both a "compressed cluster" and an "uncompressed cluster" + // because we don't know which one will fill up first. We also need + // to track the dirents currently in each, so we can fix up the + // cluster index if the other one ends up written first. 
+ compCluster = new Cluster(compression); + uncompCluster = new Cluster(Compression::None); + +#if defined(ENABLE_XAPIAN) + auto xapianIndexer = std::make_shared(this, withIndex); + m_direntHandlers.push_back(xapianIndexer); +#endif + + mp_titleListingHandler = std::make_shared(this); + m_direntHandlers.push_back(mp_titleListingHandler); + m_direntHandlers.push_back(std::make_shared(this)); + + for(auto& handler:m_direntHandlers) { + handler->start(); + } + } + + CreatorData::~CreatorData() + { + quitAllThreads(); + if (compCluster) + delete compCluster; + if (uncompCluster) + delete uncompCluster; + for(auto& cluster: clustersList) { + delete cluster; + } + if ( out_fd != - 1 ) { + ::close(out_fd); + } + if ( ! tmpFileName.empty() ) { + DEFAULTFS::removeFile(tmpFileName); + } + } + + void CreatorData::quitAllThreads() { + // Quit all workerThreads + for (auto i=0U; i< workerThreads.size(); i++) { + taskList.pushToQueue(nullptr); + } + for(auto& thread: workerThreads) { + thread.join(); + } + workerThreads.clear(); + + // Wait for writerThread to finish. 
+ if (writerThread.joinable()) { + clusterToWrite.pushToQueue(nullptr); + writerThread.join(); + } + } + + void CreatorData::addDirent(Dirent* dirent) + { + auto ret = dirents.insert(dirent); + if (!ret.second) { + Dirent* existing = *ret.first; + if (existing->isRedirect() && !dirent->isRedirect()) { + unresolvedRedirectDirents.erase(existing); + dirents.erase(ret.first); + existing->markRemoved(); + dirents.insert(dirent); + } else { + std::ostringstream ss; + ss << "Impossible to add " << NsAsChar(dirent->getNamespace()) << "/" << dirent->getPath() << std::endl; + ss << " dirent's title to add is : " << dirent->getTitle() << std::endl; + ss << " existing dirent's title is : " << existing->getTitle() << std::endl; + throw std::runtime_error(ss.str()); + } + }; + + if (dirent->isRedirect()) { + unresolvedRedirectDirents.insert(dirent); + nbRedirectItems++; + } + } + + void CreatorData::addItemData(Dirent* dirent, std::unique_ptr provider, bool compressContent) + { + // Add blob data to compressed or uncompressed cluster. + auto itemSize = provider->getSize(); + if (itemSize > 0) + { + isEmpty = false; + } + + auto cluster = compressContent ? compCluster : uncompCluster; + + // If cluster will be too large, write it to disk, and open a new + // one for the content. 
+ if ( cluster->count() + && cluster->size().v+itemSize >= clusterSize + ) + { + log_info("cluster with " << cluster->count() << " items, " << + cluster->size() << " bytes; current title \"" << + dirent->getTitle() << '\"'); + cluster = closeCluster(compressContent); + } + + dirent->setCluster(cluster); + cluster->addContent(std::move(provider)); + + if (compressContent) { + nbCompItems++; + } else { + nbUnCompItems++; + } + } + + Dirent* CreatorData::createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title) + { + auto dirent = pool.getClassicDirent(ns, path, title, getMimeTypeIdx(mimetype)); + addDirent(dirent); + return dirent; + } + + Dirent* CreatorData::createItemDirent(const Item* item) + { + auto path = item->getPath(); + auto mimetype = item->getMimeType(); + if (mimetype.empty()) { + std::cerr << "Warning, " << item->getPath() << " have empty mimetype." << std::endl; + mimetype = "application/octet-stream"; + } + return createDirent(NS::C, item->getPath(), mimetype, item->getTitle()); + } + + Dirent* CreatorData::createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath) + { + auto dirent = pool.getRedirectDirent(ns, path, title, targetNs, targetPath); + addDirent(dirent); + return dirent; + } + + Cluster* CreatorData::closeCluster(bool compressed) + { + Cluster *cluster; + nbClusters++; + if (compressed ) + { + cluster = compCluster; + nbCompClusters++; + } else { + cluster = uncompCluster; + nbUnCompClusters++; + } + cluster->setClusterIndex(cluster_index_t(clustersList.size())); + clustersList.push_back(cluster); + taskList.pushToQueue(new ClusterTask(cluster)); + clusterToWrite.pushToQueue(cluster); + + if (compressed) + { + cluster = compCluster = new Cluster(compression); + } else { + cluster = uncompCluster = new Cluster(Compression::None); + } + return cluster; + } + + void CreatorData::setEntryIndexes() + { + // set index + INFO("set index"); 
+ entry_index_t idx(0); + for (auto& dirent: dirents) { + dirent->setIdx(idx); + idx += 1; + } + } + + void CreatorData::resolveRedirectIndexes() + { + // translate redirect aid to index + INFO("Resolve redirect"); + for (auto dirent: unresolvedRedirectDirents) + { + Dirent tmpDirent(dirent->getRedirectNs(), dirent->getRedirectPath()); + auto target_pos = dirents.find(&tmpDirent); + if(target_pos == dirents.end()) { + INFO("Invalid redirection " + << NsAsChar(dirent->getNamespace()) << '/' << dirent->getPath() + << " redirecting to (missing) " + << NsAsChar(dirent->getRedirectNs()) << '/' << dirent->getRedirectPath()); + dirents.erase(dirent); + dirent->markRemoved(); + if (dirent == mainPageDirent) { + mainPageDirent = nullptr; + } + } else { + dirent->setRedirect(*target_pos); + } + } + } + + void CreatorData::resolveMimeTypes() + { + std::vector oldMImeList; + std::vector mapping; + + for (auto& rmimeType: rmimeTypesMap) + { + oldMImeList.push_back(rmimeType.second); + mimeTypesList.push_back(rmimeType.second); + } + + mapping.resize(oldMImeList.size()); + std::sort(mimeTypesList.begin(), mimeTypesList.end()); + + for (unsigned i=0; i(j); + } + } + + for (auto& dirent: dirents) + { + if (dirent->isItem()) + dirent->setMimeType(mapping[dirent->getMimeType()]); + } + } + + uint16_t CreatorData::getMimeTypeIdx(const std::string& mimeType) + { + auto it = mimeTypesMap.find(mimeType); + if (it == mimeTypesMap.end()) + { + if (nextMimeIdx >= std::numeric_limits::max()) + throw std::runtime_error("too many distinct mime types"); + mimeTypesMap[mimeType] = nextMimeIdx; + rmimeTypesMap[nextMimeIdx] = mimeType; + return nextMimeIdx++; + } + + return it->second; + } + + const std::string& CreatorData::getMimeType(uint16_t mimeTypeIdx) const + { + auto it = rmimeTypesMap.find(mimeTypeIdx); + if (it == rmimeTypesMap.end()) + throw std::runtime_error("mime type index not found"); + return it->second; + } + } +} diff --git a/src/writer/creatordata.h b/src/writer/creatordata.h 
new file mode 100644 index 0000000..a4da9ae --- /dev/null +++ b/src/writer/creatordata.h @@ -0,0 +1,153 @@ +/* + * Copyright (C) 2018-2021 Matthieu Gautier + * Copyright (C) 2021 Manessh P M + * Copyright (C) 2020 Veloman Yunkan + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_CREATOR_DATA_H +#define ZIM_WRITER_CREATOR_DATA_H + +#include +#include "queue.h" +#include "_dirent.h" +#include "workers.h" +#include "handler.h" +#include +#include +#include +#include +#include +#include "config.h" + +#include "../fileheader.h" +#include "direntPool.h" +#include "titleListingHandler.h" + +namespace zim +{ + namespace writer + { + struct UrlCompare { + bool operator() (const Dirent* d1, const Dirent* d2) const { + return compareUrl(d1, d2); + } + }; + + class Cluster; + class CreatorData + { + public: + typedef std::set UrlSortedDirents; + typedef std::map MimeTypesMap; + typedef std::map RMimeTypesMap; + typedef std::vector MimeTypesList; + typedef std::vector ClusterList; + typedef Queue ClusterQueue; + typedef Queue TaskQueue; + typedef std::vector ThreadList; + + CreatorData(const std::string& fname, bool verbose, + bool withIndex, std::string language, + Compression compression, + size_t 
clusterSize); + virtual ~CreatorData(); + + void addDirent(Dirent* dirent); + void addItemData(Dirent* dirent, std::unique_ptr provider, bool compressContent); + + Dirent* createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title); + Dirent* createItemDirent(const Item* item); + Dirent* createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath); + Cluster* closeCluster(bool compressed); + + void setEntryIndexes(); + void resolveRedirectIndexes(); + void resolveMimeTypes(); + + uint16_t getMimeTypeIdx(const std::string& mimeType); + const std::string& getMimeType(uint16_t mimeTypeIdx) const; + + void quitAllThreads(); + + DirentPool pool; + + UrlSortedDirents dirents; + UrlSortedDirents unresolvedRedirectDirents; + Dirent* mainPageDirent; + + MimeTypesMap mimeTypesMap; + RMimeTypesMap rmimeTypesMap; + MimeTypesList mimeTypesList; + uint16_t nextMimeIdx = 0; + + ClusterList clustersList; + ClusterQueue clusterToWrite; + TaskQueue taskList; + ThreadList workerThreads; + std::thread writerThread; + const Compression compression; + std::string zimName; + std::string tmpFileName; + bool isEmpty = true; + size_t clusterSize; + Cluster *compCluster = nullptr; + Cluster *uncompCluster = nullptr; + int out_fd; + + bool withIndex; + std::string indexingLanguage; + + std::shared_ptr mp_titleListingHandler; + offset_t m_titleListBlobOffset; // The offset the title list blob, + // related to the beginning of the start of cluster's data. 
+ std::vector> m_direntHandlers; + void handle(Dirent* dirent, const Hints& hints = Hints()) { + for(auto& handler: m_direntHandlers) { + handler->handle(dirent, hints); + } + } + void handle(Dirent* dirent, std::shared_ptr item) { + for(auto& handler: m_direntHandlers) { + handler->handle(dirent, item); + } + } + + // Some stats + bool verbose; + entry_index_type nbItems; + entry_index_type nbRedirectItems; + entry_index_type nbCompItems; + entry_index_type nbUnCompItems; + cluster_index_type nbClusters; + cluster_index_type nbCompClusters; + cluster_index_type nbUnCompClusters; + time_t start_time; + + cluster_index_t clusterCount() const + { return cluster_index_t(clustersList.size()); } + + entry_index_t itemCount() const + { return entry_index_t(dirents.size()); } + }; + + } + +} + +#endif // ZIM_WRITER_CREATOR_DATA_H diff --git a/src/writer/defaultIndexData.h b/src/writer/defaultIndexData.h new file mode 100644 index 0000000..9924c32 --- /dev/null +++ b/src/writer/defaultIndexData.h @@ -0,0 +1,130 @@ +/* + * Copyright (C) 2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DEFAULTINDEXDATA_H +#define ZIM_WRITER_DEFAULTINDEXDATA_H + +#include +#include "xapian/myhtmlparse.h" +#include "../tools.h" + +#include +#include +#include + +namespace zim +{ + namespace writer + { + class DefaultIndexData : public IndexData { + public: + DefaultIndexData(std::unique_ptr contentProvider, const std::string& title) + : m_initialized(false), + mp_contentProvider(std::move(contentProvider)), +#if defined(ENABLE_XAPIAN) + m_title(zim::removeAccents(title)), +#else + m_title(""), +#endif + m_hasIndexData(false), + m_content(""), + m_keywords(""), + m_wordCount(0), + m_geoPosition(std::make_tuple(false, 0, 0)) + {} + + void initialize() const { + if (m_initialized) { + return; + } + std::lock_guard lock(m_initLock); + // We have to do a double check to be sure that two calls on an uninitialized object + // will not initialize it twice. + if (m_initialized) { + return; + } +#if defined(ENABLE_XAPIAN) + std::ostringstream ss; + while (true) { + auto blob = mp_contentProvider->feed(); + if(blob.size() == 0) { + break; + } + ss << blob; + } + MyHtmlParser htmlParser; + try { + htmlParser.parse_html(ss.str(), "UTF-8", true); + } catch(...) 
{} + m_hasIndexData = !htmlParser.dump.empty() && htmlParser.indexing_allowed && (htmlParser.dump.find("NOINDEX") == std::string::npos); + m_content = zim::removeAccents(htmlParser.dump); + m_keywords = zim::removeAccents(htmlParser.keywords); + m_wordCount = zim::countWords(htmlParser.dump); + if(htmlParser.has_geoPosition) { + m_geoPosition = std::make_tuple(true, htmlParser.latitude, htmlParser.longitude); + } +#endif + m_initialized = true; + } + + bool hasIndexData() const { + initialize(); + return m_hasIndexData; + } + + std::string getTitle() const { + return m_title; + } + + std::string getContent() const { + initialize(); + return m_content; + } + + std::string getKeywords() const { + initialize(); + return m_keywords; + } + + uint32_t getWordCount() const { + initialize(); + return m_wordCount; + } + + GeoPosition getGeoPosition() const + { + initialize(); + return m_geoPosition; + } + + private: + mutable std::atomic m_initialized; + mutable std::mutex m_initLock; + std::unique_ptr mp_contentProvider; + std::string m_title; + mutable bool m_hasIndexData; + mutable std::string m_content; + mutable std::string m_keywords; + mutable uint32_t m_wordCount; + mutable GeoPosition m_geoPosition; + }; + } +} + +#endif // ZIM_WRITER_DEFAULTINDEXDATA_H diff --git a/src/writer/dirent.cpp b/src/writer/dirent.cpp new file mode 100644 index 0000000..f5df2ec --- /dev/null +++ b/src/writer/dirent.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (C) 2020 Matthieu Gautier + * Copyright (C) 2006 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "_dirent.h" +#include +#include "buffer.h" +#include "endian_tools.h" +#include "log.h" +#include +#include +#ifdef _WIN32 +# include +#else +# include +# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \ +{throw std::runtime_error("Error writing");} +#endif + +log_define("zim.dirent") + +namespace zim { +namespace writer { + +char NsAsChar(NS ns) { + switch(ns) { + case NS::C: return 'C'; + case NS::M: return 'M'; + case NS::W: return 'W'; + case NS::X: return 'X'; + } + throw std::runtime_error("Invalid namespace value."); +} + +// Creator for a "classic" dirent +Dirent::Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype) + : pathTitle(path, title), + mimeType(mimetype), + idx(0), + info(DirentInfo::Direct()), + offset(0), + _ns(static_cast(ns)), + removed(false), + frontArticle(false) +{} + +// Creator for a "redirection" dirent +Dirent::Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath) + : pathTitle(path, title), + mimeType(redirectMimeType), + idx(0), + info(std::move(DirentInfo::Redirect(targetNs, targetPath))), + offset(0), + _ns(static_cast(ns)), + removed(false), + frontArticle(false) +{} + +NS Dirent::getRedirectNs() const { + return info.getRedirect().ns; +} + +std::string Dirent::getRedirectPath() const { + return info.getRedirect().targetPath; +} + +void Dirent::write(int out_fd) const +{ + const static char 
zero = 0; + union + { + char d[16]; + long a; + } header; + zim::toLittleEndian(getMimeType(), header.d); + header.d[2] = 0; // parameter size + header.d[3] = NsAsChar(getNamespace()); + + log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size()); + + zim::toLittleEndian(getVersion(), header.d + 4); + + if (isRedirect()) + { + zim::toLittleEndian(getRedirectIndex().v, header.d + 8); + _write(out_fd, header.d, 12); + } + else + { + zim::toLittleEndian(zim::cluster_index_type(getClusterNumber()), header.d + 8); + zim::toLittleEndian(zim::blob_index_type(getBlobNumber()), header.d + 12); + _write(out_fd, header.d, 16); + } + + _write(out_fd, pathTitle.data(), pathTitle.size()); + _write(out_fd, &zero, 1); +} + +} +} diff --git a/src/writer/direntPool.h b/src/writer/direntPool.h new file mode 100644 index 0000000..227fbb3 --- /dev/null +++ b/src/writer/direntPool.h @@ -0,0 +1,89 @@ +/* + * Copyright (C) 2019-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#ifndef ZIM_WRITER_DIRENTPOOL_H +#define ZIM_WRITER_DIRENTPOOL_H + +#include "debug.h" +#include "_dirent.h" + +namespace zim +{ + namespace writer { + class DirentPool { + private: + std::vector pools; + uint16_t direntIndex; + + void allocate_new_pool() { + pools.push_back(reinterpret_cast(new char[sizeof(Dirent)*0xFFFF])); + direntIndex = 0; + } + static void destroyPoolBlock(Dirent* pool, uint16_t count=0xFFFF) { + for (auto i = 0U; i < count; i++) { + try { + pool[i].~Dirent(); + } catch (...){ /*discard*/ } + } + delete [] (reinterpret_cast(pool)); + } + + + public: + DirentPool() : + direntIndex(0xFFFF) + {} + DirentPool(const DirentPool&) = delete; + DirentPool& operator=(const DirentPool&) = delete; + ~DirentPool() { + auto nbPools = pools.size(); + if (nbPools == 0) { + return; + } + // Delete all but last pools (add call the destructors of the dirents) + for (auto i = 0U; i + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#ifndef OPENZIM_LIBZIM_WRITER_HANDLER_H +#define OPENZIM_LIBZIM_WRITER_HANDLER_H + +#include +#include +#include + +#include + +namespace zim { +namespace writer { + +class CreatorData; +class ContentProvider; +class Dirent; + +/** + * DirentHandler is used to add "extra" handling on dirent/item. + * + * The main purpose of the handler is to "see" all dirents corresponding to user entries + * and generate its own dirent/item. + * + * Classical use cases are: + * - Generating an index of the item (xapianIndex) + * - Generating a listing of the item (all items or "main" entries only) + * - Count mimetypes + * - ... + * + * The workflow is the following: + * - Start the handler with `start()`. + * - Pass dirents to handle using `handle()`. + * If a handler has to handle itself, it has to do it itself before (in start/stop, ...) + * The handlers will NOT have dirents of other handlers passed. + * (Exception made for titleListingHandler) + * - Get the dirents associated to the handler using `createDirents()`. + * Handler must create dirents if entry/entries associated to it must be created. + * It may create several dirents if several entries must be created. + * It may return an empty vector (no dirent) if no entry must be created (empty listing,...). + * - All dirents are correctly set (redirect resolved, index and mimetype set, ...) + * - Stop the handler with `stop()`. + * - Get the content of the handler using `getContentProviders`. + * Handler MUST return as many contentProviders as the number of dirents it has returned. + * + * While it seems that DirentHandler is dynamically (de)activated by the user, it is not. + * This is purely an internal structure to simplify the internal architecture of the writer. 
+ */ +class DirentHandler { + public: + explicit DirentHandler(CreatorData* data); + virtual ~DirentHandler() = default; + using ContentProviders = std::vector>; + using Dirents = std::vector; + + virtual void start() = 0; + virtual void stop() = 0; + virtual bool isCompressible() = 0; + const Dirents& getDirents() { + if (!m_direntsCreated) { + m_dirents = createDirents(); + m_direntsCreated = true; + } + return m_dirents; + } + virtual ContentProviders getContentProviders() const = 0; + + /* + * Handle a dirent/item. + * + * item may be nullptr (dirent is a redirect or in special case) + */ + virtual void handle(Dirent* dirent, std::shared_ptr item) = 0; + virtual void handle(Dirent* dirent, const Hints& hints) = 0; + + protected: + virtual Dirents createDirents() const = 0; + DirentHandler() = default; + + private: + Dirents m_dirents; + bool m_direntsCreated {false}; +}; + +} +} + +#endif // OPENZIM_LIBZIM_WRITER_HANDLER_H diff --git a/src/writer/item.cpp b/src/writer/item.cpp new file mode 100644 index 0000000..3ba745f --- /dev/null +++ b/src/writer/item.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * Copyright (C) 2009 Tommi Maekitalo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include +#include +#include "defaultIndexData.h" + +namespace zim +{ + namespace writer + { + std::shared_ptr Item::getIndexData() const + { + if (getMimeType().find("text/html")!=0) { + return nullptr; + } + + auto provider = getContentProvider(); + return std::make_shared(std::move(provider), getTitle()); + } + + Hints Item::getHints() const { + return Hints(); + } + + Hints Item::getAmendedHints() const { + auto hints = getHints(); + + // If not FRONT_ARTICLE hints is given, determine it from the mimetype. + if (hints.find(FRONT_ARTICLE) == hints.end()) { + hints[FRONT_ARTICLE] = (getMimeType().find("text/html") == 0); + } + + // If not COMPRESS hints is given, determine it from the mimetype. + if (hints.find(COMPRESS) == hints.end()) { + hints[COMPRESS] = isCompressibleMimetype(getMimeType()); + } + return hints; + } + + std::unique_ptr StringItem::getContentProvider() const + { + auto shared_string = std::shared_ptr(shared_from_this(), &content); + return std::unique_ptr(new SharedStringProvider(shared_string)); + } + + std::unique_ptr FileItem::getContentProvider() const + { + return std::unique_ptr(new FileProvider(filepath)); + } + + + } +} diff --git a/src/writer/queue.h b/src/writer/queue.h new file mode 100644 index 0000000..454087e --- /dev/null +++ b/src/writer/queue.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2016-2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_QUEUE_H +#define OPENZIM_LIBZIM_QUEUE_H + +#define MAX_QUEUE_SIZE 10 + +#include +#include +#include "../tools.h" + +template +class Queue { + public: + Queue() = default; + virtual ~Queue() = default; + virtual bool isEmpty(); + virtual size_t size(); + virtual void pushToQueue(const T& element); + virtual bool getHead(T &element); + virtual bool popFromQueue(T &element); + + protected: + std::queue m_realQueue; + std::mutex m_queueMutex; + + private: + // Make this queue non copyable + Queue(const Queue&); + Queue& operator=(const Queue&); +}; + +template +bool Queue::isEmpty() { + std::lock_guard l(m_queueMutex); + return m_realQueue.empty(); +} + +template +size_t Queue::size() { + std::lock_guard l(m_queueMutex); + return m_realQueue.size(); +} + +template +void Queue::pushToQueue(const T &element) { + unsigned int wait = 0; + unsigned int queueSize = 0; + + do { + zim::microsleep(wait); + queueSize = size(); + wait += 10; + } while (queueSize > MAX_QUEUE_SIZE); + + std::lock_guard l(m_queueMutex); + m_realQueue.push(element); +} + +template +bool Queue::getHead(T &element) { + std::lock_guard l(m_queueMutex); + if (m_realQueue.empty()) { + return false; + } + element = m_realQueue.front(); + return true; +} + +template +bool Queue::popFromQueue(T &element) { + std::lock_guard l(m_queueMutex); + if (m_realQueue.empty()) { + return false; + } + + element = m_realQueue.front(); + m_realQueue.pop(); + + return true; +} + +#endif // OPENZIM_LIBZIM_QUEUE_H diff 
namespace zim
{
  namespace writer {
    /**
     * Compact, move-only byte string: one heap pointer plus a 16-bit length.
     *
     * The content is NOT NUL-terminated and may contain embedded NUL bytes.
     * A default-constructed (or moved-from) instance has a null buffer and
     * size 0.
     */
    class TinyString {
      public: // functions
        /// Creates an empty string that owns no buffer.
        TinyString() noexcept :
          m_data(nullptr),
          m_size(0)
        {}

        /**
         * Copies the bytes of `s`.
         *
         * @throws std::runtime_error if `s.size() >= 0xFFFF`. The length is
         *         validated BEFORE the buffer is allocated, so nothing is
         *         leaked when the exception is thrown.
         */
        TinyString(const std::string& s) :
          m_data(new char[checkedSize(s.size())]),
          m_size(static_cast<uint16_t>(s.size()))
        {
          std::memcpy(m_data, s.data(), m_size);
        }

        /// Steals the buffer of `t`, leaving `t` empty.
        TinyString(TinyString&& t) noexcept :
          m_data(t.m_data),
          m_size(t.m_size)
        {
          t.m_data = nullptr;
          t.m_size = 0;
        }

        TinyString(const TinyString& t) = delete;

        ~TinyString() {
          delete[] m_data;  // deleting a null pointer is a no-op
          m_data = nullptr;
        }

        /// Returns a std::string copy of the stored bytes.
        operator std::string() const {
          return m_data ? std::string(m_data, m_size) : std::string();
        }
        bool empty() const { return m_size == 0; }
        size_t size() const { return m_size; }
        /// Raw (non NUL-terminated) buffer; null for an empty default instance.
        const char* data() const { return m_data; }

        /// Byte-wise equality.
        bool operator==(const TinyString& other) const {
          if (m_size != other.m_size) {
            return false;
          }
          // memcmp must not be handed a null pointer, even for zero bytes.
          return m_size == 0 || std::memcmp(m_data, other.m_data, m_size) == 0;
        }

        /// Lexicographic byte-wise ordering; a strict prefix sorts first.
        bool operator<(const TinyString& other) const {
          const uint16_t common = (m_size < other.m_size) ? m_size : other.m_size;
          const int ret = (common == 0) ? 0 : std::memcmp(m_data, other.m_data, common);
          if (ret != 0) {
            return ret < 0;
          }
          return m_size < other.m_size;
        }

      private: // helpers
        /// Validates a length against the 16-bit storage limit.
        static uint16_t checkedSize(std::string::size_type len) {
          if (len >= 0xFFFF) {
            throw std::runtime_error("String len is too big");
          }
          return static_cast<uint16_t>(len);
        }

      protected: // members
        char* m_data;
        uint16_t m_size;
    } PACKED;

    /**
     * Stores a path and a title in one TinyString as "<path>\0<title>".
     *
     * When the title equals the path only "<path>\0" is stored and the title
     * is reconstructed from the path on demand.
     */
    class PathTitleTinyString : public TinyString {
      public:
        PathTitleTinyString() : TinyString() {}
        PathTitleTinyString(const std::string& path, const std::string& title)
          : TinyString(PathTitleTinyString::concat(path, title))
        {}

        /// Builds the packed "<path>\0[<title>]" representation.
        static std::string concat(const std::string& path, const std::string& title) {
          std::string result(path);
          result.push_back('\0');
          if ( title != path ) {
            result += title;
          }
          return result;
        }

        /// Returns the path part (everything before the first NUL byte).
        std::string getPath() const {
          if (m_size == 0) {
            return std::string();
          }
          return std::string(m_data, pathLength());
        }

        /**
         * Returns the title.
         *
         * @param storedOnly when true, an entry whose title was elided
         *        (because it equals the path) yields an empty string; when
         *        false, the path is returned as the title in that case.
         */
        std::string getTitle(bool storedOnly) const {
          if (m_size == 0) {
            return std::string();
          }
          const size_t pathLen = pathLength();
          const size_t title_start = pathLen + 1;
          if (title_start >= m_size) {
            // Nothing stored after the separator.
            return storedOnly ? std::string() : std::string(m_data, pathLen);
          }
          return std::string(m_data + title_start, m_size - title_start);
        }

      private:
        /// Length of the path part; the scan never reads past the buffer.
        /// Must only be called when m_size != 0.
        size_t pathLength() const {
          const void* nul = std::memchr(m_data, '\0', m_size);
          if (nul == nullptr) {
            return m_size;
          }
          return static_cast<size_t>(static_cast<const char*>(nul) - m_data);
        }
    } PACKED;
  }
}
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "titleListingHandler.h" +#include "creatordata.h" + +#include "../endian_tools.h" + +#include +#include + +using namespace zim::writer; + +namespace { + +class ListingProvider : public ContentProvider { + public: + ListingProvider(const TitleListingHandler::Dirents* dirents, bool frontOnly) + : mp_dirents(dirents), + m_it(dirents->begin()), + m_frontOnly(frontOnly) + {} + + zim::size_type getSize() const override { + if (m_frontOnly) { + auto nbFrontArticles = std::count_if(mp_dirents->begin(), mp_dirents->end(), [](Dirent* d) { return d->isFrontArticle();}); + return nbFrontArticles * sizeof(zim::entry_index_type); + } else { + return mp_dirents->size() * sizeof(zim::entry_index_type); + } + } + + zim::Blob feed() override { + if (m_frontOnly) { + while (m_it != mp_dirents->end() && !(*m_it)->isFrontArticle()) { + m_it++; + } + } + if (m_it == mp_dirents->end()) { + return zim::Blob(nullptr, 0); + } + zim::toLittleEndian((*m_it)->getIdx().v, buffer); + m_it++; + return zim::Blob(buffer, sizeof(zim::entry_index_type)); + } + + private: + const TitleListingHandler::Dirents* mp_dirents; + char buffer[sizeof(zim::entry_index_type)]; + TitleListingHandler::Dirents::const_iterator m_it; + bool m_frontOnly; +}; + +} // end of anonymous namespace + +TitleListingHandler::TitleListingHandler(CreatorData* data) + : mp_creatorData(data), + m_hasFrontArticles(false) +{} + +TitleListingHandler::~TitleListingHandler() = default; + +void TitleListingHandler::start() { +} + +void 
TitleListingHandler::stop() { + m_handledDirents.erase( + std::remove_if(m_handledDirents.begin(), m_handledDirents.end(), [](const Dirent* d) { return d->isRemoved(); }), + m_handledDirents.end()); + std::sort(m_handledDirents.begin(), m_handledDirents.end(), TitleCompare()); +} + +DirentHandler::Dirents TitleListingHandler::createDirents() const { + Dirents ret; + ret.push_back(mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v0", "application/octet-stream+zimlisting", "")); + if (m_hasFrontArticles) { + ret.push_back(mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v1", "application/octet-stream+zimlisting", "")); + } + return ret; +} + +DirentHandler::ContentProviders TitleListingHandler::getContentProviders() const { + ContentProviders ret; + ret.push_back(std::unique_ptr(new ListingProvider(&m_handledDirents, false))); + if (m_hasFrontArticles) { + ret.push_back(std::unique_ptr(new ListingProvider(&m_handledDirents, true))); + } + return ret; +} + +void TitleListingHandler::handle(Dirent* dirent, std::shared_ptr item) +{ + handle(dirent, item->getAmendedHints()); +} + +void TitleListingHandler::handle(Dirent* dirent, const Hints& hints) +{ + m_handledDirents.push_back(dirent); + + // By definition, dirent not in `C` namespace are not FRONT_ARTICLE + if (dirent->getNamespace() != NS::C) { + return; + } + + try { + if(bool(hints.at(FRONT_ARTICLE))) { + m_hasFrontArticles = true; + dirent->setFrontArticle(); + } + } catch(std::out_of_range&) {} +} + diff --git a/src/writer/titleListingHandler.h b/src/writer/titleListingHandler.h new file mode 100644 index 0000000..65f19d5 --- /dev/null +++ b/src/writer/titleListingHandler.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_LISTING_HANDLER_H +#define OPENZIM_LIBZIM_LISTING_HANDLER_H + +#include "handler.h" +#include "_dirent.h" + +#include + +namespace zim { +namespace writer { + +struct TitleCompare { + bool operator() (const Dirent* d1, const Dirent* d2) const { + return compareTitle(d1, d2); + } +}; + +// This handler is in charge of handling titles. +// It will create the "classic" old V0 title listing (for ALL entries) but also +// the V1 title listing (for front article only). +class TitleListingHandler : public DirentHandler { + public: + explicit TitleListingHandler(CreatorData* data); + virtual ~TitleListingHandler(); + + void start() override; + void stop() override; + bool isCompressible() override { return false; } + ContentProviders getContentProviders() const override; + void handle(Dirent* dirent, std::shared_ptr item) override; + void handle(Dirent* dirent, const Hints& hints) override; + + protected: + Dirents createDirents() const override; + CreatorData* mp_creatorData; + Dirents m_handledDirents; + bool m_hasFrontArticles; +}; +} +} + +#endif // OPENZIM_LIBZIM_LISTING_HANDLER_H diff --git a/src/writer/workers.cpp b/src/writer/workers.cpp new file mode 100644 index 0000000..d48418f --- /dev/null +++ b/src/writer/workers.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2019-2020 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software 
Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "workers.h" +#include "cluster.h" +#include "creatordata.h" + +#include "../tools.h" + +#ifdef _WIN32 +#include +#else +#include +#endif + +namespace zim +{ + namespace writer + { + + void* taskRunner(void* arg) { + auto creatorData = static_cast(arg); + Task* task; + unsigned int wait = 0; + + while(true) { + microsleep(wait); + wait += 100; + if (creatorData->taskList.popFromQueue(task)) { + if (task == nullptr) { + return nullptr; + } + task->run(creatorData); + delete task; + wait = 0; + } + } + return nullptr; + } + + void* clusterWriter(void* arg) { + auto creatorData = static_cast(arg); + Cluster* cluster; + unsigned int wait = 0; + while(true) { + microsleep(wait); + wait += 100; + if(creatorData->clusterToWrite.getHead(cluster)) { + if (cluster == nullptr) { + // All cluster writen, we can quit + return nullptr; + } + if (not cluster->isClosed()) { + continue; + } + creatorData->clusterToWrite.popFromQueue(cluster); + cluster->setOffset(offset_t(lseek(creatorData->out_fd, 0, SEEK_CUR))); + cluster->write(creatorData->out_fd); + cluster->clear_data(); + wait = 0; + } + } + return nullptr; + } + } +} diff --git a/src/writer/workers.h b/src/writer/workers.h new file mode 100644 index 0000000..2e9d68c --- /dev/null +++ b/src/writer/workers.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2019-2020 Matthieu Gautier + * + * This program is free software; you can 
namespace zim {
namespace writer {

class CreatorData;

/// Unit of work executed by taskRunner().
class Task {
  public:
    Task() = default;
    virtual ~Task() = default;

    /// Performs the work; `data` is the creator state the runner was given.
    virtual void run(CreatorData* data) = 0;
};

// Thread entry points. Both take the CreatorData* as `void*` and return
// nullptr.
void* taskRunner(void* data);
void* clusterWriter(void* data);

}
}
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "xapianHandler.h" +#include "xapianIndexer.h" +#include "xapianWorker.h" +#include "creatordata.h" + +#include + +using namespace zim::writer; + +XapianHandler::XapianHandler(CreatorData* data, bool withFulltextIndex) + : mp_fulltextIndexer(withFulltextIndex ? new XapianIndexer(data->zimName+"_fulltext.idx", data->indexingLanguage, IndexingMode::FULL, true) : nullptr), + mp_titleIndexer(new XapianIndexer(data->zimName+"_title.idx", data->indexingLanguage, IndexingMode::TITLE, true)), + mp_creatorData(data) +{} + +XapianHandler::~XapianHandler() = default; + +void XapianHandler::start() { + if (mp_fulltextIndexer) { + mp_fulltextIndexer->indexingPrelude(); + } + mp_titleIndexer->indexingPrelude(); +} + +void XapianHandler::stop() { + // We need to wait that all indexation tasks have been done before closing the + // xapian database. + if (mp_fulltextIndexer) { + IndexTask::waitNoMoreTask(); + mp_fulltextIndexer->indexingPostlude(); + } + mp_titleIndexer->indexingPostlude(); +} + +DirentHandler::Dirents XapianHandler::createDirents() const { + // Wait for all task to be done before checking if we are empty. 
+ Dirents ret; + if (mp_fulltextIndexer) { + IndexTask::waitNoMoreTask(); + if (!mp_fulltextIndexer->is_empty()) { + ret.push_back(mp_creatorData->createDirent(NS::X, "fulltext/xapian", "application/octet-stream+xapian", "")); + } + } + if (!mp_titleIndexer->is_empty()) { + ret.push_back(mp_creatorData->createDirent(NS::X, "title/xapian", "application/octet-stream+xapian", "")); + } + return ret; +} + +DirentHandler::ContentProviders XapianHandler::getContentProviders() const { + ContentProviders ret; + if (mp_fulltextIndexer && !mp_fulltextIndexer->is_empty()) { + ret.push_back(std::unique_ptr(new FileProvider(mp_fulltextIndexer->getIndexPath()))); + } + if (!mp_titleIndexer->is_empty()) { + ret.push_back(std::unique_ptr(new FileProvider(mp_titleIndexer->getIndexPath()))); + } + return ret; +} + +void XapianHandler::indexTitle(Dirent* dirent) { + auto title = dirent->getRealTitle(); + if (title.empty()) { + return; + } + auto path = dirent->getPath(); + if (dirent->isRedirect()) { + auto redirectPath = dirent->getRedirectPath(); + mp_titleIndexer->indexTitle(path, title, redirectPath); + } else { + mp_titleIndexer->indexTitle(path, title); + } +} + +void XapianHandler::handle(Dirent* dirent, const Hints& hints) +{ + if (dirent->getNamespace() != NS::C) { + return; + } + + try { + if (bool(hints.at(FRONT_ARTICLE))) { + indexTitle(dirent); + } + } catch(std::out_of_range&) {} +} + +void XapianHandler::handle(Dirent* dirent, std::shared_ptr item) +{ + if (dirent->getNamespace() != NS::C) { + return; + } + + // Title index. 
+ handle(dirent, item->getAmendedHints()); + + // FullText index + if (mp_fulltextIndexer) { + auto indexData = item->getIndexData(); + if (!indexData) { + return; + } + auto path = dirent->getPath(); + mp_creatorData->taskList.pushToQueue(new IndexTask(indexData, path, mp_fulltextIndexer.get())); + } +} + diff --git a/src/writer/xapianHandler.h b/src/writer/xapianHandler.h new file mode 100644 index 0000000..86a6773 --- /dev/null +++ b/src/writer/xapianHandler.h @@ -0,0 +1,57 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#ifndef OPENZIM_LIBZIM_XAPIAN_HANDLER_H +#define OPENZIM_LIBZIM_XAPIAN_HANDLER_H + +#include "handler.h" + +namespace zim { +namespace writer { + +class XapianIndexer; + +class XapianHandler : public DirentHandler { + public: + XapianHandler(CreatorData* data, bool withFullTextIndex); + virtual ~XapianHandler(); + + void start() override; + void stop() override; + bool isCompressible() override { return false; } + ContentProviders getContentProviders() const override; + void handle(Dirent* dirent, std::shared_ptr item) override; + void handle(Dirent* dirent, const Hints& hints) override; + + protected: + Dirents createDirents() const override; + + private: // methods + void indexTitle(Dirent* dirent); + + private: // data + std::unique_ptr mp_fulltextIndexer; + std::unique_ptr mp_titleIndexer; + CreatorData* mp_creatorData; +}; + +} +} + +#endif // OPENZIM_LIBZIM_XAPIAN_WORKER_H diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp new file mode 100644 index 0000000..ac42681 --- /dev/null +++ b/src/writer/xapianIndexer.cpp @@ -0,0 +1,163 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2018-2021 Matthieu Gautier + * Copyright (C) 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#include "xapianIndexer.h" +#include "libzim-resources.h" +#include "fs.h" +#include "tools.h" +#include "../constants.h" +#include +#include +#include +#include + +using namespace zim::writer; + +/* Constructor */ +XapianIndexer::XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode indexingMode, const bool verbose) + : indexPath(indexPath), + language(language), + indexingMode(indexingMode) +{ + /* Build ICU Local object to retrieve ISO-639 language code (from + ISO-639-3) */ + icu::Locale languageLocale(language.c_str()); + stemmer_language = languageLocale.getLanguage(); + + /* Read the stopwords */ + std::string stopWord; + try { + this->stopwords = getResource("stopwords/" + language); + } catch(ResourceNotFound& e) {} + std::istringstream file(this->stopwords); + while (std::getline(file, stopWord, '\n')) { + this->stopper.add(stopWord); + } +} + +XapianIndexer::~XapianIndexer() +{ + if (!indexPath.empty()) { + try { +#ifndef _WIN32 +//[TODO] Implement remove for windows + zim::DEFAULTFS::remove(indexPath + ".tmp"); + zim::DEFAULTFS::remove(indexPath); +#endif + } catch (...) { + /* Do not raise */ + } + } +} + +/* + * `valuesmap` is a metadata associated with the Xapian database. We are using it + * to attach slot numbers of each document in the index to the value they are storing. + * These values and slot numbers are used in collapsing, filtering etc. + * + * Title index: + * Slot 0: Title of the article. Used in collapsing articles with same name. + * Slot 1: path/redirectPath of the article. Used in collapsing duplicates(redirects). + * + * Fulltext Index: + * Slot 0: Title of the article. Used in collapsing articles with same name. + * Slot 1: Word count of the article. + * Slot 2: Geo position of the article. Used for geo-filtering. + * + * `kind` metadata indicate whether the database is a title or a fulltext index. + * + * `data` metadata indicate the type of data stored in the index. 
A value of "fullPath" + * means the data stores the complete path with a namespace. + */ + +void XapianIndexer::indexingPrelude() +{ + writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE | Xapian::DB_NO_TERMLIST); + + switch (indexingMode) { + case IndexingMode::TITLE: + writableDatabase.set_metadata("valuesmap", "title:0;targetPath:1"); + writableDatabase.set_metadata("kind", "title"); + writableDatabase.set_metadata("data", "fullPath"); + break; + case IndexingMode::FULL: + writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2"); + writableDatabase.set_metadata("kind", "fulltext"); + writableDatabase.set_metadata("data", "fullPath"); + break; + } + writableDatabase.set_metadata("language", language); + writableDatabase.set_metadata("stopwords", stopwords); +} + +/* + * For title index, index the full path with namespace as data of the document. + * The targetPath in valuesmap will store the path without namespace. + * TODO: + * Currently for title index we are storing path twice (redirectPath/path in + * valuesmap and path in index data). In the future, we want to keep only one of + * these(index data if possible) to reduce index size while supporting the + * collapse on path feature. + */ + +void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath) +{ + assert(indexingMode == IndexingMode::TITLE); + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + try { + stemmer = Xapian::Stem(stemmer_language); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME); + } catch (...) 
{} + Xapian::Document currentDocument; + currentDocument.clear_values(); + + std::string fullPath = "C/" + path; + currentDocument.set_data(fullPath); + indexer.set_document(currentDocument); + + std::string unaccentedTitle = zim::removeAccents(title); + + currentDocument.add_value(0, title); + if (targetPath.empty()) { + currentDocument.add_value(1, path); + } else { + currentDocument.add_value(1, targetPath); + } + + if (!unaccentedTitle.empty()) { + std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle; + indexer.index_text(anchoredTitle, 1); + } + + /* add to the database */ + writableDatabase.add_document(currentDocument); + empty = false; +} + +void XapianIndexer::indexingPostlude() +{ + this->writableDatabase.commit(); + this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE|Xapian::Compactor::FULLER); + this->writableDatabase.close(); +} + diff --git a/src/writer/xapianIndexer.h b/src/writer/xapianIndexer.h new file mode 100644 index 0000000..ffed3a7 --- /dev/null +++ b/src/writer/xapianIndexer.h @@ -0,0 +1,70 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2018-2021 Matthieu Gautier + * Copyright (C) 2011 Emmanuel Engelhart + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
+ */ + +#ifndef LIBZIM_WRITER_XAPIANINDEXER_H +#define LIBZIM_WRITER_XAPIANINDEXER_H + +#include + +#include +#include +#include + + +namespace zim { +namespace writer { + +class IndexTask; + +enum class IndexingMode { + TITLE, + FULL +}; + +class XapianIndexer +{ + public: + XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode mode, bool verbose); + virtual ~XapianIndexer(); + std::string getIndexPath() { return indexPath; } + void indexingPrelude(); + void indexingPostlude(); + bool is_empty() { return empty; } + + void indexTitle(const std::string& path, const std::string& title, const std::string& targetPath = ""); + + protected: + Xapian::WritableDatabase writableDatabase; + bool empty {true}; + std::string stemmer_language; + Xapian::SimpleStopper stopper; + std::string indexPath; + std::string language; + std::string stopwords; + IndexingMode indexingMode; + + friend class zim::writer::IndexTask; +}; + +} +} + +#endif // LIBZIM_WRITER_XAPIANINDEXER_H diff --git a/src/writer/xapianWorker.cpp b/src/writer/xapianWorker.cpp new file mode 100644 index 0000000..3f23027 --- /dev/null +++ b/src/writer/xapianWorker.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2021 Maneesh P M + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied + * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and + * NON-INFRINGEMENT. See the GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + * + */ + +#include "xapianWorker.h" +#include "creatordata.h" + +#include "xapianIndexer.h" + +#include +#include +#include + +static std::mutex s_dbaccessLock; +std::atomic zim::writer::IndexTask::waiting_task(0); + +namespace zim +{ + namespace writer + { + + const unsigned int keywordsBoostFactor = 3; + inline unsigned int getTitleBoostFactor(const unsigned int contentLength) + { + return contentLength / 500 + 1; + } + + void IndexTask::waitNoMoreTask() { + unsigned int wait = 0; + do { + microsleep(wait); + wait += 10; + } while (waiting_task.load() > 0); + } + + void IndexTask::run(CreatorData* data) { + if (!mp_indexData->hasIndexData()) { + return; + } + Xapian::Stem stemmer; + Xapian::TermGenerator indexer; + try { + stemmer = Xapian::Stem(mp_indexer->stemmer_language); + indexer.set_stemmer(stemmer); + indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL); + } catch (...) { + // No stemming for language. 
+ } + indexer.set_stopper(&mp_indexer->stopper); + indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL); + + Xapian::Document document; + indexer.set_document(document); + + std::string fullPath = "C/" + m_path; + document.set_data(fullPath); + document.add_value(0, mp_indexData->getTitle()); + + std::stringstream countWordStringStream; + countWordStringStream << mp_indexData->getWordCount(); + document.add_value(1, countWordStringStream.str()); + + auto geoInfo = mp_indexData->getGeoPosition(); + if (std::get<0>(geoInfo)) { + auto geoPosition = Xapian::LatLongCoord( + std::get<1>(geoInfo), std::get<2>(geoInfo)).serialise(); + document.add_value(2, geoPosition); + } + + /* Index the content */ + auto indexContent = mp_indexData->getContent(); + if (!indexContent.empty()) { + indexer.index_text_without_positions(indexContent); + } + + /* Index the title */ + auto indexTitle = mp_indexData->getTitle(); + if (!indexTitle.empty()) { + indexer.index_text_without_positions( + indexTitle, getTitleBoostFactor(indexContent.size())); + } + + /* Index the keywords */ + auto indexKeywords = mp_indexData->getKeywords(); + if (!indexKeywords.empty()) { + indexer.index_text_without_positions(indexKeywords, keywordsBoostFactor); + } + + std::lock_guard l(s_dbaccessLock); + mp_indexer->writableDatabase.add_document(document); + mp_indexer->empty = false; + } + } +} diff --git a/src/writer/xapianWorker.h b/src/writer/xapianWorker.h new file mode 100644 index 0000000..4d6ad96 --- /dev/null +++ b/src/writer/xapianWorker.h @@ -0,0 +1,64 @@ +/* + * Copyright (C) 2020-2021 Matthieu Gautier + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef OPENZIM_LIBZIM_XAPIAN_WORKER_H +#define OPENZIM_LIBZIM_XAPIAN_WORKER_H + +#include +#include +#include "workers.h" +#include + +namespace zim { +namespace writer { + +class Item; +class XapianIndexer; + +class IndexTask : public Task { + public: + IndexTask(const IndexTask&) = delete; + IndexTask& operator=(const IndexTask&) = delete; + IndexTask(std::shared_ptr indexData, const std::string& path, XapianIndexer* indexer) : + mp_indexData(indexData), + m_path(path), + mp_indexer(indexer) + { + ++waiting_task; + } + virtual ~IndexTask() + { + --waiting_task; + } + + static void waitNoMoreTask(); + + virtual void run(CreatorData* data); + static std::atomic waiting_task; + + private: + std::shared_ptr mp_indexData; + std::string m_path; + XapianIndexer* mp_indexer; +}; + +} +} + +#endif // OPENZIM_LIBZIM_XAPIAN_WORKER_H diff --git a/src/xapian/htmlparse.cc b/src/xapian/htmlparse.cc new file mode 100644 index 0000000..447023f --- /dev/null +++ b/src/xapian/htmlparse.cc @@ -0,0 +1,376 @@ +/* htmlparse.cc: simple HTML parser for omega indexer + * + * Copyright 1999,2000,2001 BrightStation PLC + * Copyright 2001 Ananova Ltd + * Copyright 2002,2006,2007,2008 Olly Betts + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License, or (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 + * USA + */ + +// #include + +#include "htmlparse.h" + +#include + +// #include "utf8convert.h" + +#include +#include + +#include +#include +#include +#include + +using namespace std; + +inline void +lowercase_string(string &str) +{ + for (string::iterator i = str.begin(); i != str.end(); ++i) { + *i = tolower(static_cast(*i)); + } +} + +map zim::HtmlParser::named_ents; +static std::mutex sInitLock; + +inline static bool +p_notdigit(char c) +{ + return !isdigit(static_cast(c)); +} + +inline static bool +p_notxdigit(char c) +{ + return !isxdigit(static_cast(c)); +} + +inline static bool +p_notalnum(char c) +{ + return !isalnum(static_cast(c)); +} + +inline static bool +p_notwhitespace(char c) +{ + return !isspace(static_cast(c)); +} + +inline static bool +p_nottag(char c) +{ + return !isalnum(static_cast(c)) && + c != '.' && c != '-' && c != ':'; // ':' for XML namespaces. 
+} + +inline static bool +p_whitespacegt(char c) +{ + return isspace(static_cast(c)) || c == '>'; +} + +inline static bool +p_whitespaceeqgt(char c) +{ + return isspace(static_cast(c)) || c == '=' || c == '>'; +} + +bool +zim::HtmlParser::get_parameter(const string & param, string & value) +{ + map::const_iterator i = parameters.find(param); + if (i == parameters.end()) return false; + value = i->second; + return true; +} + +zim::HtmlParser::HtmlParser() +{ + static const struct ent { const char *n; unsigned int v; } ents[] = { +#include "namedentities.h" + { NULL, 0 } + }; + std::lock_guard l(sInitLock); + if (named_ents.empty()) { + const struct ent *i = ents; + while (i->n) { + named_ents[string(i->n)] = i->v; + ++i; + } + } +} + +void +zim::HtmlParser::decode_entities(string &s) +{ + // We need a const_iterator version of s.end() - otherwise the + // find() and find_if() templates don't work... + string::const_iterator amp = s.begin(), s_end = s.end(); + while ((amp = find(amp, s_end, '&')) != s_end) { + unsigned int val = 0; + string::const_iterator end, p = amp + 1; + if (p != s_end && *p == '#') { + p++; + if (p != s_end && (*p == 'x' || *p == 'X')) { + // hex + p++; + end = find_if(p, s_end, p_notxdigit); + sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val); + } else { + // number + end = find_if(p, s_end, p_notdigit); + val = atoi(s.substr(p - s.begin(), end - p).c_str()); + } + } else { + end = find_if(p, s_end, p_notalnum); + string code = s.substr(p - s.begin(), end - p); + map::const_iterator i; + i = named_ents.find(code); + if (i != named_ents.end()) val = i->second; + } + if (end < s_end && *end == ';') end++; + if (val) { + string::size_type amp_pos = amp - s.begin(); + if (val < 0x80) { + s.replace(amp_pos, end - amp, 1u, char(val)); + } else { + // Convert unicode value val to UTF-8. 
+ char seq[4]; + unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq); + s.replace(amp_pos, end - amp, seq, len); + } + s_end = s.end(); + // We've modified the string, so the iterators are no longer + // valid... + amp = s.begin() + amp_pos + 1; + } else { + amp = end; + } + } +} + +void +zim::HtmlParser::parse_html(const string &body) +{ + in_script = false; + + parameters.clear(); + string::const_iterator start = body.begin(); + + while (true) { + // Skip through until we find an HTML tag, a comment, or the end of + // document. Ignore isolated occurrences of `<' which don't start + // a tag or comment. + string::const_iterator p = start; + while (true) { + p = find(p, body.end(), '<'); + if (p == body.end()) break; + unsigned char ch = *(p + 1); + + // Tag, closing tag, or comment (or SGML declaration). + if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break; + + if (ch == '?') { + // PHP code or XML declaration. + // XML declaration is only valid at the start of the first line. + // FIXME: need to deal with BOMs... + if (p != body.begin() || body.size() < 20) break; + + // XML declaration looks something like this: + // + if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break; + if (strchr(" \t\r\n", p[5]) == NULL) break; + + string::const_iterator decl_end = find(p + 6, body.end(), '?'); + if (decl_end == body.end()) break; + + // Default charset for XML is UTF-8. 
+ charset = "UTF-8"; + + string decl(p + 6, decl_end); + size_t enc = decl.find("encoding"); + if (enc == string::npos) break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 8); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '=') break; + + enc = decl.find_first_not_of(" \t\r\n", enc + 1); + if (enc == string::npos || enc == decl.size()) break; + + if (decl[enc] != '"' && decl[enc] != '\'') break; + + char quote = decl[enc++]; + size_t enc_end = decl.find(quote, enc); + + if (enc != string::npos) + charset = decl.substr(enc, enc_end - enc); + + break; + } + p++; + } + + // Process text up to start of tag. + if (p > start) { + string text = body.substr(start - body.begin(), p - start); + // convert_to_utf8(text, charset); + decode_entities(text); + process_text(text); + } + + if (p == body.end()) break; + + start = p + 1; + + if (start == body.end()) break; + + if (*start == '!') { + if (++start == body.end()) break; + if (++start == body.end()) break; + // comment or SGML declaration + if (*(start - 1) == '-' && *start == '-') { + ++start; + string::const_iterator close = find(start, body.end(), '>'); + // An unterminated comment swallows rest of document + // (like Netscape, but unlike MSIE IIRC) + if (close == body.end()) break; + + p = close; + // look for --> + while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-')) + p = find(p + 1, body.end(), '>'); + + if (p != body.end()) { + // Check for htdig's "ignore this bit" comments. + if (p - start == 15 && string(start, p - 2) == "htdig_noindex") { + string::size_type i; + i = body.find("", p + 1 - body.begin()); + if (i == string::npos) break; + start = body.begin() + i + 21; + continue; + } + // If we found --> skip to there. + start = p; + } else { + // Otherwise skip to the first > we found (as Netscape does). 
+ start = close; + } + } else { + // just an SGML declaration, perhaps giving the DTD - ignore it + start = find(start - 1, body.end(), '>'); + if (start == body.end()) break; + } + ++start; + } else if (*start == '?') { + if (++start == body.end()) break; + // PHP - swallow until ?> or EOF + start = find(start + 1, body.end(), '>'); + + // look for ?> + while (start != body.end() && *(start - 1) != '?') + start = find(start + 1, body.end(), '>'); + + // unterminated PHP swallows rest of document (rather arbitrarily + // but it avoids polluting the database when things go wrong) + if (start != body.end()) ++start; + } else { + // opening or closing tag + int closing = 0; + + if (*start == '/') { + closing = 1; + start = find_if(start + 1, body.end(), p_notwhitespace); + } + + p = start; + start = find_if(start, body.end(), p_nottag); + string tag = body.substr(p - body.begin(), start - p); + // convert tagname to lowercase + lowercase_string(tag); + + if (closing) { + closing_tag(tag); + if (in_script && tag == "script") in_script = false; + + /* ignore any bogus parameters on closing tags */ + p = find(start, body.end(), '>'); + if (p == body.end()) break; + start = p + 1; + } else { + // FIXME: parse parameters lazily. 
+ while (start < body.end() && *start != '>') { + string name, value; + + p = find_if(start, body.end(), p_whitespaceeqgt); + + name.assign(body, start - body.begin(), p - start); + + p = find_if(p, body.end(), p_notwhitespace); + + start = p; + if (start != body.end() && *start == '=') { + start = find_if(start + 1, body.end(), p_notwhitespace); + + p = body.end(); + + int quote = *start; + if (quote == '"' || quote == '\'') { + start++; + p = find(start, body.end(), quote); + } + + if (p == body.end()) { + // unquoted or no closing quote + p = find_if(start, body.end(), p_whitespacegt); + } + value.assign(body, start - body.begin(), p - start); + start = find_if(p, body.end(), p_notwhitespace); + + if (!name.empty()) { + // convert parameter name to lowercase + lowercase_string(name); + // in case of multiple entries, use the first + // (as Netscape does) + parameters.insert(make_pair(name, value)); + } + } + } +#if 0 + cout << "<" << tag; + map::const_iterator x; + for (x = parameters.begin(); x != parameters.end(); x++) { + cout << " " << x->first << "=\"" << x->second << "\""; + } + cout << ">\n"; +#endif + opening_tag(tag); + parameters.clear(); + + // In