--- /dev/null
+# Codecov configuration: CI gating, coverage status checks and ignored paths.
+codecov:
+ notify:
+ # `yes` is a YAML 1.1 boolean (true).
+ require_ci_to_pass: yes
+
+# Status checks reported by Codecov (see the Codecov YAML reference for the
+# exact meaning of `target` and `threshold`).
+coverage:
+ status:
+ # Whole-project status, with a 1% threshold.
+ project:
+ default:
+ threshold: 1%
+ # Patch status: 90% target with a 0% threshold.
+ patch:
+ default:
+ target: 90%
+ threshold: 0%
+
+# Paths listed here are excluded from coverage.
+ignore:
+ - "test"
+ - "examples"
--- /dev/null
+# These are supported funding model platforms
+# Only `github` has a value below; every other platform key is left empty.
+
+github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+# NOTE(review): the URL below follows '#', so it is a comment and `custom` has
+# no value - confirm that leaving the custom link disabled is intended.
+custom: # https://kiwix.org/support-us/
--- /dev/null
+# Configuration for Move Issues - https://github.com/dessant/move-issues
+# Every active option below is a boolean; the `aliases` and `_extends` blocks
+# at the end are commented-out examples and are not in effect.
+
+# Delete the command comment when it contains no other content
+deleteCommand: true
+
+# Close the source issue after moving
+closeSourceIssue: true
+
+# Lock the source issue after moving
+lockSourceIssue: false
+
+# Mention issue and comment authors
+mentionAuthors: true
+
+# Preserve mentions in the issue content
+keepContentMentions: true
+
+# Move labels that also exist on the target repository
+moveLabels: true
+
+# Set custom aliases for targets
+# aliases:
+# r: repo
+# or: owner/repo
+
+# Repository to extend settings from
+# _extends: repo
\ No newline at end of file
--- /dev/null
+rem Build libzim on Windows: run the Visual Studio 2019 (Enterprise) vcvars64
+rem script, configure with meson, then compile with ninja.
+rem NOTE(review): the Visual Studio path is hard-coded - confirm it exists on the CI image.
+call "C:\Program Files (x86)\Microsoft Visual Studio\2019\Enterprise\VC\Auxiliary\Build\vcvars64.bat"
+
+rem Use cl.exe as both the C and the C++ compiler.
+set CC=cl.exe
+set CXX=cl.exe
+
+rem Static default library, xapian disabled, fallback forced for liblzma;
+rem the zstd programs/tests/contrib and liblzma's xz are switched off.
+meson.exe setup build . --force-fallback-for liblzma -Ddefault_library=static -Dwith_xapian=false -Dzstd:bin_programs=false -Dzstd:bin_tests=false -Dzstd:bin_contrib=false -Dliblzma:default_library=static -Dliblzma:enable_xz=false
+
+rem Compile from inside the directory configured above.
+cd build
+
+ninja.exe
--- /dev/null
+name: CI
+
+# Triggered by push events only.
+on: [push]
+
+jobs:
+ # macOS job: builds one native target and four iOS targets; only native_*
+ # targets run the Test step. fail-fast is off, so one failing target does not
+ # cancel the others.
+ Macos:
+ strategy:
+ fail-fast: false
+ matrix:
+ target:
+ - native_dyn
+ - iOS_arm64
+ - iOS_i386
+ - iOS_x86_64
+ - iOS_armv7
+ runs-on: macos-10.15
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v1
+ - name: Setup python 3.9
+ uses: actions/setup-python@v1
+ with:
+ python-version: '3.9'
+ # If `brew install` fails, the `||` fallback relinks python@3.9 with --overwrite.
+ - name: Install packages
+ run: |
+ brew update
+ brew install gcovr pkg-config ninja || brew link --overwrite python@3.9
+ # meson is pinned to 0.52.1 for this job.
+ - name: Install python modules
+ run: pip3 install meson==0.52.1 pytest
+ # Downloads the archive deps2_osx_<target>_libzim.tar.xz and unpacks it into $HOME.
+ - name: Install deps
+ shell: bash
+ run: |
+ ARCHIVE_NAME=deps2_osx_${{matrix.target}}_libzim.tar.xz
+ wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C $HOME
+ # Shared library by default; non-native targets additionally get bitcode, the
+ # cross file from $HOME/BUILD_<target> (printed to the log) and static linkage.
+ - name: Compile
+ shell: bash
+ run: |
+ MESON_OPTION="--default-library=shared"
+ MESON_CROSSFILE="$HOME/BUILD_${{matrix.target}}/meson_cross_file.txt"
+ if [[ ! "${{matrix.target}}" =~ native_.* ]]; then
+ MESON_OPTION="$MESON_OPTION -Db_bitcode=true --cross-file $MESON_CROSSFILE -Dstatic-linkage=true"
+ cat $MESON_CROSSFILE
+ fi
+ export PKG_CONFIG_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig
+ meson . build ${MESON_OPTION}
+ cd build
+ ninja
+ # Runs only for native_* targets, with SKIP_BIG_MEMORY_TEST=1 in the environment.
+ # NOTE(review): LD_LIBRARY_PATH is exported here; macOS's dynamic loader
+ # normally consults DYLD_LIBRARY_PATH - confirm this has the intended effect.
+ - name: Test
+ if: startsWith(matrix.target, 'native_')
+ shell: bash
+ run: |
+ export LD_LIBRARY_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib:$HOME/BUILD_${{matrix.target}}/INSTALL/lib64
+ cd build
+ ninja download_test_data
+ meson test --verbose
+ env:
+ SKIP_BIG_MEMORY_TEST: 1
+
+ # Windows job: single native build followed by the test suite.
+ # NOTE(review): this job floats on `windows-latest`, while the build script it
+ # calls hard-codes a Visual Studio 2019 Enterprise path - confirm the image
+ # still provides it.
+ Windows:
+ runs-on: windows-latest
+ steps:
+ - name: Checkout code
+ uses: actions/checkout@v1
+ - name: Setup python 3.6
+ uses: actions/setup-python@v1
+ with:
+ python-version: '3.6'
+ - name: Install packages
+ run:
+ choco install ninja
+ # meson is installed without a version pin here.
+ - name: Install python modules
+ run: pip3 install meson
+ # Configure and build are delegated entirely to the batch script.
+ - name: Compile
+ shell: cmd
+ run: .github\script\build_libzim.cmd
+ # Downloads the test data through ninja, then runs the meson test suite from `build`.
+ - name: Test
+ shell: cmd
+ run: |
+ cd build
+ ninja download_test_data
+ meson test --verbose
+
+ # Linux job: every target is built twice, with and without xapian
+ # (6 targets x 2 values of with_xapian).
+ Linux:
+ strategy:
+ fail-fast: false
+ matrix:
+ target:
+ - native_static
+ - native_dyn
+ - android_arm
+ - android_arm64
+ - win32_static
+ - win32_dyn
+ with_xapian:
+ - true
+ - false
+ # Per-target extras: `image_variant` selects the container image tag and
+ # `lib_postfix` is appended to `INSTALL/lib` in the pkg-config and library
+ # search paths used by later steps.
+ include:
+ - target: native_static
+ image_variant: xenial
+ lib_postfix: '/x86_64-linux-gnu'
+ - target: native_dyn
+ image_variant: xenial
+ lib_postfix: '/x86_64-linux-gnu'
+ - target: android_arm
+ image_variant: xenial
+ lib_postfix: '/x86_64-linux-gnu'
+ - target: android_arm64
+ image_variant: xenial
+ lib_postfix: '/x86_64-linux-gnu'
+ - target: win32_static
+ image_variant: f31
+ lib_postfix: '64'
+ - target: win32_dyn
+ image_variant: f31
+ lib_postfix: '64'
+ # HOME is forced to /home/runner; the steps below clone and unpack under it.
+ env:
+ HOME: /home/runner
+ runs-on: ubuntu-latest
+ # Steps run inside the kiwix-build CI image matching the target's image_variant.
+ container:
+ image: "kiwix/kiwix-build_ci:${{matrix.image_variant}}-31"
+ steps:
+ # Expose the short name of the pushed ref as the step output `branch`, which
+ # the checkout step passes to `git clone --branch`.
+ # The value is appended to $GITHUB_OUTPUT because the `set-output` workflow
+ # command is deprecated.
+ # `refs/*/` strips `refs/heads/` as well as `refs/tags/`, so a tag push yields
+ # a bare tag name instead of the full `refs/tags/...` ref.
+ - name: Extract branch name
+ shell: bash
+ run: echo "branch=${GITHUB_REF#refs/*/}" >> "$GITHUB_OUTPUT"
+ id: extract_branch
+ # Manual checkout through a Python script: sets git's global http.postBuffer,
+ # then shallow-clones (depth 1) the branch from the `extract_branch` step.
+ # Both commands run with cwd=$HOME, so the clone lands under $HOME.
+ - name: Checkout code
+ shell: python
+ run: |
+ from subprocess import check_call
+ from os import environ
+ config_command = [
+ 'git', 'config', '--global',
+ 'http.postBuffer', '1048576000'
+ ]
+ check_call(config_command, cwd=environ['HOME'])
+ clone_command = [
+ 'git', 'clone',
+ 'https://github.com/${{github.repository}}',
+ '--depth=1',
+ '--branch', '${{steps.extract_branch.outputs.branch}}'
+ ]
+ check_call(clone_command, cwd=environ['HOME'])
+ # Downloads the dependency archive for this target and unpacks it into /home/runner.
+ # NOTE(review): OS_NAME is not set anywhere in this workflow - presumably it
+ # comes from the container image; confirm.
+ - name: Install deps
+ shell: bash
+ run: |
+ ARCHIVE_NAME=deps2_${OS_NAME}_${{matrix.target}}_libzim.tar.xz
+ wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C /home/runner
+ # Option selection: *_dyn targets build a shared library, the rest static;
+ # native_* targets enable -Db_coverage=true, the others use the target's cross
+ # file; android_* targets add -Dandroid=true. The build runs in $HOME/libzim.
+ - name: Compile
+ shell: bash
+ run: |
+ if [[ "${{matrix.target}}" =~ .*_dyn ]]; then
+ MESON_OPTION="--default-library=shared"
+ else
+ MESON_OPTION="--default-library=static"
+ fi
+ if [[ "${{matrix.target}}" =~ native_.* ]]; then
+ MESON_OPTION="$MESON_OPTION -Db_coverage=true"
+ else
+ MESON_OPTION="$MESON_OPTION --cross-file $HOME/BUILD_${{matrix.target}}/meson_cross_file.txt"
+ fi
+ if [[ "${{matrix.target}}" =~ android_.* ]]; then
+ MESON_OPTION="$MESON_OPTION -Dandroid=true"
+ fi
+ cd $HOME/libzim
+ meson . build ${MESON_OPTION} -Dwith_xapian=${{matrix.with_xapian}}
+ cd build
+ ninja
+ env:
+ PKG_CONFIG_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}/pkgconfig"
+ # native_* targets only: download the test data, run the tests, then run the
+ # `coverage` ninja target.
+ - name: Test
+ if: startsWith(matrix.target, 'native_')
+ shell: bash
+ run: |
+ cd $HOME/libzim/build
+ ninja download_test_data
+ meson test --verbose
+ ninja coverage
+ env:
+ LD_LIBRARY_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}"
+ SKIP_BIG_MEMORY_TEST: 1
+ # native_* targets only: fetch the Codecov bash script, run it with the upload
+ # name "<OS_NAME>_<target>", then delete it. The token comes from repository secrets.
+ - name: Publish coverage
+ shell: bash
+ run: |
+ cd $HOME/libzim
+ curl https://codecov.io/bash -o codecov.sh
+ bash codecov.sh -n "${OS_NAME}_${{matrix.target}}" -Z
+ rm codecov.sh
+ if: startsWith(matrix.target, 'native_')
+ env:
+ CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
--- /dev/null
+name: Packages
+# Runs on pushes and on pull requests.
+on: [push, pull_request]
+
+jobs:
+ # One job per distro in the matrix; fail-fast is off, so a failure for one
+ # distro does not cancel the others.
+ build-deb:
+ runs-on: ubuntu-latest
+ strategy:
+ fail-fast: false
+ matrix:
+ distro:
+ - debian-unstable
+ - debian-bullseye
+ - debian-buster
+ - ubuntu-jammy
+ - ubuntu-impish
+ - ubuntu-hirsute
+ - ubuntu-focal
+ - ubuntu-bionic
+ steps:
+ - uses: actions/checkout@v2
+
+ # Determine which PPA we should upload to: `kiwixteam/release` when the ref
+ # starts with `refs/tags`, `kiwixteam/dev` otherwise. The result is exposed as
+ # the step output `ppa`.
+ # The value is appended to $GITHUB_OUTPUT because the `set-output` workflow
+ # command is deprecated.
+ - name: PPA
+ id: ppa
+ run: |
+ if [[ $REF == refs/tags* ]]
+ then
+ echo "ppa=kiwixteam/release" >> "$GITHUB_OUTPUT"
+ else
+ echo "ppa=kiwixteam/dev" >> "$GITHUB_OUTPUT"
+ fi
+ env:
+ REF: ${{ github.ref }}
+
+ # Runs the auto-dch action for the current distro with the Kiwix builder identity.
+ - uses: legoktm/gh-action-auto-dch@master
+ with:
+ fullname: Kiwix builder
+ email: release+launchpad@kiwix.org
+ distro: ${{ matrix.distro }}
+
+ # One build step per distro follows. Each uses the action ref named after its
+ # distro and is gated by `if`, so exactly one of them runs per matrix job.
+ # All are called with --no-sign; only the ubuntu-* steps also pass the `ppa`
+ # output of the PPA step.
+ - uses: legoktm/gh-action-build-deb@debian-unstable
+ if: matrix.distro == 'debian-unstable'
+ name: Build package for debian-unstable
+ id: build-debian-unstable
+ with:
+ args: --no-sign
+
+ - uses: legoktm/gh-action-build-deb@debian-bullseye
+ if: matrix.distro == 'debian-bullseye'
+ name: Build package for debian-bullseye
+ id: build-debian-bullseye
+ with:
+ args: --no-sign
+
+ - uses: legoktm/gh-action-build-deb@debian-buster
+ if: matrix.distro == 'debian-buster'
+ name: Build package for debian-buster
+ id: build-debian-buster
+ with:
+ args: --no-sign
+
+ - uses: legoktm/gh-action-build-deb@ubuntu-jammy
+ if: matrix.distro == 'ubuntu-jammy'
+ name: Build package for ubuntu-jammy
+ id: build-ubuntu-jammy
+ with:
+ args: --no-sign
+ ppa: ${{ steps.ppa.outputs.ppa }}
+
+ - uses: legoktm/gh-action-build-deb@ubuntu-impish
+ if: matrix.distro == 'ubuntu-impish'
+ name: Build package for ubuntu-impish
+ id: build-ubuntu-impish
+ with:
+ args: --no-sign
+ ppa: ${{ steps.ppa.outputs.ppa }}
+
+ - uses: legoktm/gh-action-build-deb@ubuntu-hirsute
+ if: matrix.distro == 'ubuntu-hirsute'
+ name: Build package for ubuntu-hirsute
+ id: build-ubuntu-hirsute
+ with:
+ args: --no-sign
+ ppa: ${{ steps.ppa.outputs.ppa }}
+
+ - uses: legoktm/gh-action-build-deb@ubuntu-focal
+ if: matrix.distro == 'ubuntu-focal'
+ name: Build package for ubuntu-focal
+ id: build-ubuntu-focal
+ with:
+ args: --no-sign
+ ppa: ${{ steps.ppa.outputs.ppa }}
+
+ - uses: legoktm/gh-action-build-deb@ubuntu-bionic
+ if: matrix.distro == 'ubuntu-bionic'
+ name: Build package for ubuntu-bionic
+ id: build-ubuntu-bionic
+ with:
+ args: --no-sign
+ ppa: ${{ steps.ppa.outputs.ppa }}
+
+ # Publish the contents of `output` as a workflow artifact named after the distro.
+ - uses: actions/upload-artifact@v2
+ with:
+ name: Packages for ${{ matrix.distro }}
+ path: output
+
+ - uses: legoktm/gh-action-dput@master
+ name: Upload dev package
+ # Only upload on pushes to master, and only for ubuntu-* distros
+ if: github.event_name == 'push' && github.event.ref == 'refs/heads/master' && startswith(matrix.distro, 'ubuntu-')
+ with:
+ gpg_key: ${{ secrets.LAUNCHPAD_GPG }}
+ repository: ppa:kiwixteam/dev
+ packages: output/*_source.changes
+
+ - uses: legoktm/gh-action-dput@master
+ name: Upload release package
+ # Only upload on tag pushes (ref starts with refs/tags), and only for ubuntu-* distros
+ if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') && startswith(matrix.distro, 'ubuntu-')
+ with:
+ gpg_key: ${{ secrets.LAUNCHPAD_GPG }}
+ repository: ppa:kiwixteam/release
+ packages: output/*_source.changes
--- /dev/null
+*~
+*#*
+autom4te.cache
+build
+compile
+config.h
+configure
+depcomp
+.deps
+.dirstamp
+INSTALL
+install-sh
+*.kate-swp
+*.la
+.libs
+libtool
+*.lo
+ltmain.sh
+*.m4
+Makefile
+Makefile.in
+missing
+*.o
+stamp-h1
+.svn
+.*.swp
+*.zim
+examples/createZimExample
+src/tools/zimdump
+src/tools/zimsearch
+libzim.pc
+test-driver
+test/zimlib-test*
+test/test-suite.log
+.clangd
--- /dev/null
+Tommi Mäkitalo
+Emmanuel Engelhart
+Kiran Mathew Koshy
+C. Scott Ananian
+Matthieu Gautier
+Kunal Mehta
+Vasudev Kamath
+Nirbheek Chauhan
+joel
+Adrian Kunding
+Vitaly Zaitsev
+Magnus Woodgate
+Renaud Gaudin
+mornfall
+Mohamed Sameh
+Dmitry Atamanov
+MiguelRocha
+Veloman Yunkan
+Gianfranco Costamagna
+Steve Wills
+hashworks
+Maneesh P M
--- /dev/null
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+\f
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+\f
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+\f
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+\f
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
--- /dev/null
+libzim 7.2.0
+============
+
+ * Add methods to get/print (dependencies) versions (@kelson42, #452)
+ * Fix Emscripten compilation (@kelson42, @mossroy, #643)
+
+libzim 7.1.0
+============
+
+ * Fix dirent test on 32 bits architectures (@mgautierfr #632)
+ * Fix compilation on Alpine - with musl (@amirouche #649)
+ * Don't crash if ZIM without illustration nor X/W namespace (@mgautierfr #641)
+ * Switch default suggestion operator to AND (@maneeshpm #644)
+ * Add a new method Archive::getMetadataItem (@mgautierfr #639)
+ * Better indexing criteria (@mgautierfr #642)
+ * Avoid duplicated archives in the searcher (@veloman-yunkan #648)
+ * Fix random entry (@veloman-yunkan #650)
+ * Various improvements.
+ - CI @mgautierfr #640, @kelson42 #638, @legoktm #654
+ - Doc @rgaudin #646
+
+libzim 7.0.0
+============
+
+Version 7.0.0 is a major release.
+
+The API has been completely rewritten.
+Most notable change is that namespaces are now hidden.
+The new API is described in documentation, which includes a Transition Guide from v6.
+
+ZIM files created with it use the new ZIM minor version (6.1 - see Header section of spec.)
+Both backward and forward compatibility is kept.
+
+Improvements
+------------
+
+ * Rewrite creator and reader API
+ This removes the namespace from the API. Articles are automatically put in
+ the right namespace ('A') and the retrieval of content is done using a
+ specific API. (@mgautier #454)
+ * Better handling of the conditional compilation without xapian.
+ Before that, the search API was present (but returning empty result) if
+ libzim was compiled without xapian. Now the API is not present anymore.
+ User code must check if libzim is compiled with xapian or not by checking
+ if LIBZIM_WITH_XAPIAN is defined or not. (@mgautierfr #465)
+ * Add a new specific listing in zim files to list entries considered as "front
+ article". At creation, wrapper MUST pass the hint `FRONT_ARTICLE` to
+ correctly mark the entry. Search by title uses this list if present.
+ (@mgautierfr #487)
+ * Store the wellknown entries in the `W` namespace (`W/mainPage`)
+ (@mgautierfr #497)
+ * Rewrite Search API. Fix potential memory leak and allow correct reuse of a
+ created search. (@mgautierfr #530)
+ * New suggestion search API. The api mimics the Search API but specialized
+ for suggestion (@maneeshpm #574)
+ * Add `zim::Archive` constructors to open an archive using a existing file descriptor.
+ This API is not available on Windows. (@veloman-yunkan #449)
+ * Make zstd the default compression algorithm (@veloman-yunkan #480)
+ * The method `zim::Archive::checkIntegrity` now checks if the mimetypes indicated in the
+ dirents are correct (@veloman-yunkan #505)
+ * Writer doesn't add a `.zim` extension to the given path. (@maneeshpm #503)
+ * Implement random entry picking. We are choosing a entry from the "front
+ article" list if present. (@mgautierfr #476)
+ * Creator now create the `M/Counter` metadata. (@mgautierfr)
+ * Better Illustration handling. Favicon is replaced by Illustration.
+ Illustration can now have different size and scale (even if the API do
+ not use this feature) (@mgautierfr #540)
+ * Search iterator now have a method `getZimId` to know the Id of the zim
+ corresponding to the result (useful for multizim search) (@maneeshpm #557)
+
+Bug fixes
+---------
+
+ * The method `zim::Archive::checkIntegrity` now check if the dirents are
+ correctly sorted. (@veloman-yunkan #448)
+ * Handle large MIME-type list. Some zim file may have a pretty large mimetype
+ list. (@veloman-yunkan #460)
+ * Fix handling of zim file containing item of size 0. (@mgautierfr #483)
+ * Better parsing of the entry paths to detect the namespace (@maneeshpm #479)
+ * Fix zim file creation on Windows (@mgautierfr #508)
+ * Better algorithm tuning for suggestion search (@maneeshpm #492)
+ * The default indexer now index html content only. (@mgautierfr #511)
+ * Better suggestion search : Don't use stopwords, use OP_PHRASE
+ (@maneeshpm #501)
+ * Remove duplicate in the suggestion search (@maneeshpm #515)
+ * Remove the termlist from the xapian database, lower memory usage
+ (@maneeshpm #528)
+ * Add a anchor in the suggestion search to search term at the beginning of
+ the title (@maneeshpm #526)
+ * Make the suggestion search working with special characters (`&`, `+`)
+ (@veloman-yunkan #534)
+ * Fix creator issue not detecting that cluster must be extended if it
+ contains only 32-bit-sized content. (@veloman-yunkan #552)
+ * Correctly generate suggestion snippet. (@maneeshpm #545)
+ * Better cluster size configuration (@mgautierfr #555)
+ * Make search iterator `getTitle` return the real title of the entry and not
+ the one stored in the xapian database (caseless) (@maneeshpm #586)
+ * Correctly close a zim creator to avoid a crash when the creator is
+ destructed without being started (@mgautierfr #613)
+ * Reduce the creator memory usage by reducing the memory size of the dirent
+ (@mgautier #616, #628)
+ * Write the cluster using a bigger chunk size for performance
+ (@mgautierfr #506)
+ * Change the default cluster size to 2MiB (@mgautierfr #585)
+ * The default mimetype for metadata now includes the utf8 charset
+ (@rgaudin #626)
+ * Improve the estimation of the number of search/suggestion results by forcing
+ Xapian to evaluate at least 10 results (@mgautier #625)
+
+Other
+-----
+
+ * Update xapian stopwords list. (@data-man #447)
+ * Remove direct pthread dependency (use c++11 thread library). (@mgautierfr #443)
+ We still need pthread library on linux and freebsd as C++11 is using it internally.
+ * [CI] Make the libzim CI compile libzim natively on Windows (@mgautierfr #453).
+ * [CI] Build libzim package for Ubuntu Hirsute and Impish
+ (@legoktm #459, #580)
+ * Always create zim file using the major version 6. (@mgautierfr #512)
+ * Move the test data files out of the git repository. Now test files are
+ stored in `zim-testing-suite` repository and must be downloaded.
+ (@mgautierfr #538, #535)
+ * Add search iterator unit test (@maneeshpm #547)
+ * Correctly fix search iterator method case to use camelCase everywhere
+ (@maneeshpm #563)
+ * Add a cast to string operator on Uuid (@maneeshpm #582)
+ * Make unittest print the path of the missing zim file when something goes
+ wrong (@kelson42 #601)
+ * Delete temporary data (index) after we called `finishZimCreation` instead of
+ waiting for creator destruction. (@mgautierfr #603)
+ * Add basic user documentation (@mgautierfr #611)
+
+Known bugs
+----------
+
+The suggestion system used in the current libkiwix doesn't work with new zim
+files created with this release (and future ones).
+A new libkiwix version will be fixed and will work with new and old zim files.
+
+
+libzim 6.3.2
+============
+
+This is a hotfix of 6.3.0 :
+ * libzim now creates zim files with zstd compression level 19 instead of 22.
+ So the new libzim does not need to allocate 128Mb per cluster at
+ decompression time.
+ * At reading time, on 32 bits architectures, zstd clusters are not kept in
+ cache. This avoids also keeping the decompression stream, which reserves
+ 128Mb of memory address space.
+
+libzim 6.3.1
+============
+
+The release process of 6.3.1 was buggy. So, no 6.3.1.
+
+
+libzim 6.3.0
+============
+
+ * Rewrite internal reader structure to use stream decompression.
+ This allows libzim to not decompress the whole cluster to get an article
+ content. This is a big performance improvement, it speeds up random access
+ by a factor of 2, with a very small cost when doing "full" incremental
+ reading (zim-check/zim-dump). (@veloman-yunkan)
+ * Better dirent lookup.
+ Dirent lookup is the process of locating article data starting from the url
+ or title. This improves reading of zim file up to 10% (@veloman-yunkan)
+ * Add basic, first version of `validate` function to check internal structure
+ of a zim file. (@veloman-yunkan, @MiguelRocha)
+ * Fix compilation of libzim without xapian (@mgautierfr)
+ * Remove zlib dependency (and support of very old files created using zlib
+ compression) (@mgautierfr)
+ * New unit tests and various small fixes.
+
+
+libzim 6.2.2
+============
+
+ * Check blob index before access it in the cluster.
+ * Refactoring of the cluster reading.
+
+libzim 6.2.1 (release process broken)
+=====================================
+
+ * Update readme and add link to repology.org packages list.
+ * Fix compilation on windows.
+
+libzim 6.2.0
+============
+
+ * Fix compilation of libzim on freebsd.
+ * Rewrite unit tests to remove python based test and use gtest all the time.
+ * Make libzstd mandatory.
+ * Support for meson 0.45.
+ * Fix multipart support on macos.
+ * Add a documentation system.
+ * Better cache system implementation (huge speed up).
+ * Various (and numerous) small refactoring.
+
+
+libzim 6.1.8
+============
+
+ * Increase default timeout for test to 120 seconds/test
+ * Compression algorithm to use can be passed to `zim::writer::Creator`
+ * Add automatic debian packaging of libzim.
+ * Fix using of tmpdir (and now use env var TMPDIR) during tests.
+
+
+libzim 6.1.7
+============
+
+ * Do not assume urlPtrPos is just after the mimetype list.
+ * Fix compilation of compression test.
+ * Do not exit but throw an exception if an ASSERT is not fulfilled.
+
+libzim 6.1.6
+============
+
+ * Better (faster) implementation of the ordering of article by cluster.
+ * Fix compression algorithm.
+
+libzim 6.1.5
+============
+
+ * [Writer] Remove unused declaration of classes.
+ Those classes were not implemented nor used at all.
+
+libzim 6.1.4
+============
+
+ * [Writer] Fix excessive memory usage. Data of the clusters were cleaned at
+ the end of the process instead of as soon as they were no longer needed.
+
+libzim 6.1.3
+============
+
+ * [Writer] Use a `.tmp` suffix and rename to `.zim` at the end of the write
+ process.
+ * Add unit tests
+ * Do not include unnecessary `windows.h` headers in public zim's headers.
+
+libzim 6.1.2
+============
+
+ * [CI] Fix codecov configuration
+ * [Writer] Fix threads synchronization at end of writing process.
+
+libzim 6.1.1
+============
+
+ * Fix bug around the find function
+
+libzim 6.1.0
+============
+
+ * Compile now on OpenBSD
+ * [Test] Use the main function provided by gtest.
+ * [CI] Move the CI compilation to github actions.
+ * Add stopwords for 54 new languages.
+ * [Writer] Improve the way we are writing cluster at zim creation time.
+ - Clusters are directly written in the zim file instead of using temporary
+ files.
+ - mimetypes are limited to 944 bytes.
+ * Add a new type of iterator to iterate over articles in a performant way
+ reducing decompression of clusters. This is now the new default iterator.
+ * Add support for zim files compressed with zstd compression algorithm.
+ It is not possible to use zstd to create zim files for now.
+
+libzim 6.0.2
+============
+
+ * Fix search suggestion parsing.
+
+libzim 6.0.1
+============
+
+ * Fix crash when trying to open an empty file.
+ * Ensure that pytest tests are run on the CI.
+
+libzim 6.0.0
+============
+
+ * [Writer] Index the articles in different threads. This is a huge speed
+ improvement as the main thread is not blocked by indexing.
+ * Index the title only if `shouldIndex` returns true.
+
+libzim 5.1.0
+============
+
+ * Improve indexation of the title.
+ * Better pertinence of suggestions (only for new zim files)
+ * Improvement of the speed of Levenshtein distance for suggestions (for old
+ zims)
+
+libzim 5.0.2
+============
+
+ * Improve README.
+ * Remove gtest as embedded subproject.
+ * Better lzma compression.
+ * Better performance of the Levenshtein algorithm (better suggestions
+ performance)
+
+libzim 5.0.1
+============
+
+ * Update README.
+ * [Writer] Add debug information (print progress of the clusters writing).
+ * [Writer] Correctly print the url to the user.
+ * [CI] Add code coverage.
+
+libzim 5.0.0
+============
+
+ * Fix thread slipping for win32 crosscompilation.
+ * Fix a potential invalid access when reading dirent.
+ * Fix memory leak in the decompression algorithm.
+ * [Writer] Fix a memory leak (cluster cleaning)
+ * [Writer] Write article data in a temporary cluster file instead of a
+ temporary file per article.
+ * [Writer] Better algorithm to store the dirent while creating the zim
+ file. Better memory usage.
+ * [Writer] [API Change] Url/Ns are now handled using the same struct Url.
+ * [Writer] [API Change] No more aid and redirectAid. A redirectArticle
+ have to implement redirectUrl.
+ * [Writer] Use a memory pool to avoid multiple small memory allocations.
+ * [Writer] [API Change] Rename `ZimCreator` to `Creator`.
+ * [API Change] File's `search` and `suggestions` now return a unique_ptr
+ instead of a raw pointer.
+
+libzim 4.0.7
+============
+
+ * Build libzim without rpath.
+
+libzim 4.0.6
+============
+
+ * Support zim file created with cluster not written sequentially.
+ * Remove a meson warning.
+
+libzim 4.0.5
+============
+
+ * Store the xapian database in the right url.
+ * Do not fail when reading very small zim file (<256b).
+ * Do not print message on normal behavior.
+ * [BUILDSYSTEM] Be able to build a dynamic lib (libzim.so) but using static
+ dependencies.
+ * [CI] Use last version of meson.
+ * [CI] Use the new deps archive xz
+
+libzim 4.0.4
+============
+
+ * Fix opening of multi-part zim.
+ * Fix conversion of path to wpath on Windows.
+
+libzim 4.0.3
+============
+
+ * Implement low level file manipulation using different backends
+
+libzim 4.0.2
+============
+
+ * [Windows] Fix opening of zim file bigger than 4GiB
+
+libzim 4.0.1
+============
+
+ * [Writer] Fix wrong redirection log message
+ * Make libzim compile natively on windows using MSVC
+ * Better message when failing to read a zim file.
+ * Make libzim on windows correctly open unicode path.
+ * Add compilation option to use less memory (but more I/O).
+ Useful on low memory devices (android)
+ * Small fixes
+
+libzim 4.0.0
+============
+
+ * [Writer] Remove a lot of memory copy.
+ * [Writer] Add xapian indexing directly in libzim.
+ * [Writer] Better API.
+ * [Writer] Use multi-threading to write clusters.
+ * [Writer] Ensure mimetype of articles is not null.
+ * Extend test timeout for cluster's test.
+ * Less memory copy for cluster's test.
+ * Allow skipping tests using a lot of memory using env variable
+ `SKIP_BIG_MEMORY_TEST=1`
+ * Explicitly use the icu namespace to allow using of packaged icu lib.
+ * Use a temporary file name as long as the ZIM writing process is
+ not finished (#163)
+ * [Travis] Do not compile using gcc-5 (but the default trusty's one 4.8)
+
+libzim 3.3.0
+============
+
+ * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly
+ done by :
+ * Do not mmap the whole cluster by default.
+ * MMap only the memory associated to an article.
+ * If an article is > 4GiB, the blob associated to it is invalid
+ (data==size==0).
+ * Other information are still valid (directAccessInformation, ...)
+ * Fix writing of extended cluster in writer.
+ * Compile libzim on macos.
+ * Build libzim setting RPATH.
+ * Search result urls are now what is stored in the zim file. They should not
+ start with a `/`. This is a revert of the change made in last release.
+ (See kiwix/kiwix-lib#123)
+ * Spelling corrections in README.
+
+libzim 3.2.0
+============
+
+ * Support geo query if the xapian database has indexed localisation.
+ * Handle articles bigger than 4Go in the zim file (#110).
+ * Use AND operator between search term.
+ * Fix compilation with recent clang (#95).
+ * Add method to get article's data localisation in the zim file.
+ * Be able to get only a part of article (#77).
+ * Do not crash if we cannot open the xapian Database for some reasons.
+ (kiwix/kiwix-tools#153)
+ * Do not assume there is always a checksum in the zim file.
+ (kiwix/kiwix-tools#150)
+ * Try to do some sanity checks when opening a zim file.
+ * Use pytest to do some tests (when cython is available).
+ * Use levenshtein distance to sort and have better suggestion results.
+ * Search result urls are now always absolute (starts with a '/').
+ (kiwix/kiwix-lib#110)
+ * Open the file readonly when checking the zim file (and so be able to check
+ read only file).
+ * Accept absolute url starting with '/' when searching for article.
+ * Fix various bugs
+
+libzim 3.1.0
+============
+
+ * Lzma is not an optional dependency anymore.
+ * Better handle (report and not crash) invalid zim file.
+ * Embed source of gtest (used only if gtest is not available on the system)
+ * Move zimDump tools out of libzim repository to zim-tools
+ * ZimCreator tools don't read the command line to set options anymore.
+
+libzim 3.0.0
+============
+
+This is a major change of the libzim.
+Expect a lot of new improvements and API changes.
+
+ * Add a suggestion mode to the search
+ * Fix licensing issues
+ * Fix wrong stemming of the query when searching
+ * Deactivate searching (and so crash) in the embedded database if the zim is
+ split
+ * Rewrite the low level memory management of libzim when reading a zim file:
+ * We use a buffer base entity to handle memory and reading file instead of
+ reading file using stream.
+ * MMap the memory when possible to avoid memory copy.
+ * Use const when possible (API break)
+ * Move to googletest instead of cxxtools for unit-tests.
+ * Fix endianness bug on arm.
+ * Do not install private headers. Those headers declare private structure and
+ should not be visible (API break)
+ * Compile libzim with `-Werror` and `-Wall` options.
+ * Make libzim thread safe for reading article.
+ The search part is not thread safe, and all search operation must be
+ protected by a lock.
+ * Add method to get only a part of an article.
+ * Move some tools to zim-tools repository.
+
+
+libzim 2.0.0
+============
+
+ * Move to meson build system
+ `libzim` now uses `meson` as build system instead of `autotools`
+ * Move to C++11 standard.
+ * Fulltext search in zim file.
+ We have integrated the xapian fulltext search in libzim.
+ So now, libzim provides an API to search in a zim containing an embedded
+ fulltext index. This means that :
+ * libzim needs xapian as an (optional) dependency (if you want to compile
+ with xapian support).
+ * The old and unused search API has been removed.
+ * Remove bzip2 support.
+ * Remove Symbian support.
+ * Few API changes
+ * Make some header files private (not installed);
+ * A `Blob` can now be cast to a `string` directly;
+ * Change a lot of `File` methods to const methods.
--- /dev/null
+Libzim
+======
+
+The Libzim is the reference implementation for the [ZIM file
+format](https://wiki.openzim.org/wiki/ZIM_file_format). It's a [software
+library](https://en.wikipedia.org/wiki/Library_(computing)) to read
+and write ZIM files on many systems and architectures. More
+information about the ZIM format and the openZIM project at
+https://openzim.org/.
+
+[](https://download.openzim.org/release/libzim/)
+[](https://github.com/openzim/libzim/wiki/Repology)
+[](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html)
+[](https://github.com/openzim/libzim/actions?query=branch%3Amaster)
+[](https://libzim.readthedocs.io/en/latest/?badge=latest)
+[](https://codecov.io/gh/openzim/libzim)
+[](https://www.codefactor.io/repository/github/openzim/libzim)
+
+Disclaimer
+----------
+
+This document assumes you have a little knowledge about software
+compilation. If you experience difficulties with the dependencies or
+with the Libzim compilation itself, we recommend to have a look to
+[kiwix-build](https://github.com/kiwix/kiwix-build).
+
+Preamble
+--------
+
+Although the Libzim can be compiled/cross-compiled on/for many
+systems, the following documentation explains how to do it on POSIX
+ones. It is primarily intended for GNU/Linux systems and has been
+tested on recent releases of Ubuntu and Fedora.
+
+Dependencies
+------------
+
+The Libzim relies on many third party software libraries. They are
+prerequisites to the Libzim compilation. The following libraries
+need to be available:
+* [LZMA](https://tukaani.org/lzma/) (package `liblzma-dev` on Ubuntu)
+* [ICU](http://site.icu-project.org/) (package `libicu-dev` on Ubuntu)
+* [Zstd](https://facebook.github.io/zstd/) (package `libzstd-dev` on Ubuntu)
+* [Xapian](https://xapian.org/) - optional (package `libxapian-dev` on Ubuntu)
+* [UUID](http://e2fsprogs.sourceforge.net/) (package `uuid-dev` on Ubuntu)
+
+To test the code:
+* [Google Test](https://github.com/google/googletest) (package `googletest` on Ubuntu)
+* [ZIM Testing Suite](https://github.com/openzim/zim-testing-suite) - Reference test data set
+
+To build the documentations you need the packages:
+* [Doxygen](https://www.doxygen.nl)
+* Python packages for [Sphinx](https://www.sphinx-doc.org), [Breathe](https://breathe.readthedocs.io) and [Exhale](https://exhale.readthedocs.io)
+
+These dependencies may or may not be packaged by your operating
+system. They may also be packaged but only in an older version. The
+compilation script will tell you if one of them is missing or too old.
+In the worst case, you will have to download and compile a more recent
+version by hand.
+
+If you want to install these dependencies locally, then ensure that
+Meson (through `pkg-config`) will properly find them.
+
+Environment
+-------------
+
+The Libzim builds using [Meson](https://mesonbuild.com/) version
+0.43 or higher. Meson relies itself on Ninja, Pkg-config and few other
+compilation tools. Install them first:
+* Meson
+* Ninja
+* Pkg-config
+
+These tools should be packaged if you use a cutting edge operating
+system. If not, have a look to the [Troubleshooting](#Troubleshooting)
+section.
+
+Compilation
+-----------
+
+Once all dependencies are installed, you can compile Libzim with:
+```bash
+meson . build
+ninja -C build
+```
+
+By default, it will compile dynamic linked libraries. All binary files
+will be created in the `build` directory created automatically by
+Meson. If you want statically linked libraries, you can add
+`--default-library=static` option to the Meson command.
+
+If you want to build the documentation, we need to pass the
+`-Ddoc=true` option and run the `doc` target:
+```bash
+meson . build -Ddoc=true
+ninja -C build doc
+```
+
+Depending on your system, `ninja` command may be called `ninja-build`.
+
+By default, Libzim tries to compile with Xapian (and will generate an
+error if Xapian is not found). You can build without Xapian by
+passing the option `-Dwith_xapian=false` :
+```bash
+meson . build -Dwith_xapian=false
+ninja -C build
+```
+
+If Libzim is compiled without Xapian, all search API are removed. You
+can test if an installed version of Libzim is compiled with or without
+xapian by testing the define `LIBZIM_WITH_XAPIAN`.
+
+Testing
+-------
+
+ZIM files needed by unit-tests are not included in this repository. By
+default, Meson will use an internal directory in your build directory,
+but you can specify another directory with option `test_data_dir`:
+```bash
+meson . build -Dtest_data_dir=<A_DIR_WITH_TEST_DATA>
+```
+
+Whether you specify a directory or not, you need an extra step to
+download the data. Choose one of the following:
+* Get the data from the repository
+ [openzim/zim-testing-suite](https://github.com/openzim/zim-testing-suite)
+ and put it yourself in the directory.
+* Use the script
+ [download_test_data.py](scripts/download_test_data.py) which will
+ download and extract the data for you.
+* Ask `ninja` to do it for you with `ninja download_test_data` once the
+ project is configured.
+
+The simple workflow is:
+```bash
+meson . build # Configure the project (using default directory for test data)
+cd build
+ninja # Build
+ninja download_test_data # Download the test data
+meson test # Test
+```
+
+It is possible to deactivate all tests using test data zim files by
+passing `none` to the `test_data_dir` option:
+```bash
+meson . build -Dtest_data_dir=none
+cd build
+ninja
+meson test # Run all tests except those needing test zim files.
+```
+
+If the automated tests fail or timeout, you need to be aware that some
+tests need up to 16GB of memory. You can skip those specific tests with:
+```bash
+SKIP_BIG_MEMORY_TEST=1 meson test
+```
+
+Installation
+------------
+
+If you want to install the Libzim and the headers you just have
+compiled on your system, here we go:
+```bash
+ninja -C build install
+```
+
+You might need to run the command as root (or using `sudo`), depending
+where you want to install the libraries. After the installation
+succeeded, you may need to run ldconfig (as root).
+
+Uninstallation
+------------
+
+If you want to uninstall the Libzim:
+```bash
+ninja -C build uninstall
+```
+
+Like for the installation, you might need to run the command as root
+(or using `sudo`).
+
+Troubleshooting
+---------------
+
+If you need to install Meson "manually":
+```bash
+virtualenv -p python3 ./ # Create virtualenv
+source bin/activate # Activate the virtualenv
+pip3 install meson # Install Meson
+hash -r # Refresh bash paths
+```
+
+If you need to install Ninja "manually":
+```bash
+git clone https://github.com/ninja-build/ninja.git
+cd ninja
+git checkout release
+./configure.py --bootstrap
+mkdir ../bin
+cp ninja ../bin
+cd ..
+```
+
+If the compilation still fails, you might need to get a more recent
+version of a dependency than the one packaged by your Linux
+distribution. Try then with a source tarball distributed by the
+problematic upstream project or even directly from the source code
+repository.
+
+License
+-------
+
+[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or
+later, see [COPYING](COPYING) for more details.
--- /dev/null
+libzim (0.0.0) unstable; urgency=medium
+
+ * Initial release.
+
+ -- Kunal Mehta <legoktm@debian.org> Tue, 02 Jun 2020 01:49:48 -0700
--- /dev/null
+Source: libzim
+Section: libs
+Priority: optional
+Build-Depends: debhelper-compat (= 13),
+ liblzma-dev,
+ libicu-dev,
+ libxapian-dev,
+ libzstd-dev,
+ uuid-dev,
+ libgtest-dev,
+ meson,
+ ninja-build,
+ pkg-config
+Maintainer: Kiwix team <kiwix@kiwix.org>
+Homepage: https://www.openzim.org/wiki/Libzim
+Standards-Version: 4.4.1
+Rules-Requires-Root: no
+
+Package: libzim7
+Architecture: any
+Multi-Arch: same
+Depends: ${misc:Depends},
+ ${shlibs:Depends}
+Pre-Depends: ${misc:Pre-Depends}
+Conflicts: libzim0, libzim0v5, libzim2, libzim4, libzim5
+Replaces: libzim0, libzim0v5, libzim2, libzim4, libzim5
+Description: library implementation of ZIM specifications
+ ZIM (Zeno IMproved) is an open file format for storing the contents of
+ wiki for offline usage. This file format is primarily focused on
+ providing the contents of Wikipedia and Wikimedia projects for offline
+ use.
+ .
+ libzim is the standard implementation of ZIM specification, which
+ implements the read and write method for ZIM files.
+ .
+ ZIM is a file format created with focus on extracting and encoding data
+ from Mediawiki for offline use.
+ .
+ Features of libzim are:
+ * Native, coded in C++
+ * Extremely fast
+ * Minimal footprint
+ * Minimal dependencies
+ * Portable on most OS (Windows, Linux, Mac OS X)
+
+Package: libzim-dev
+Section: libdevel
+Architecture: any
+Depends: ${misc:Depends},
+ libzim7 (= ${binary:Version}),
+ liblzma-dev,
+ libxapian-dev,
+ libicu-dev,
+ libzstd-dev
+Description: library implementation of ZIM specifications (development)
+ ZIM (Zeno IMproved) is an open file format for storing the contents of
+ wiki for offline usage. This file format is primarily focused on
+ providing the contents of Wikipedia and Wikimedia projects for offline
+ use.
+ .
+ libzim is the standard implementation of ZIM specification, which
+ implements the read and write method for ZIM files.
+ .
+ ZIM is a file format created with focus on extracting and encoding data
+ from Mediawiki for offline use.
+ .
+ This package contains development files.
--- /dev/null
+See COPYING in the repository root.
--- /dev/null
+usr/include/*
+usr/lib/*/libzim.so
+usr/lib/*/pkgconfig/*
\ No newline at end of file
--- /dev/null
+usr/lib/*/*.so.*
\ No newline at end of file
--- /dev/null
+#!/usr/bin/make -f
+export DEB_BUILD_MAINT_OPTIONS = hardening=+all
+
+# Skip some extremely memory-intensive tests
+export SKIP_BIG_MEMORY_TEST=1
+%:
+ dh $@ --buildsystem=meson
+
+# Skip tests that require zim-testing-data for now
+override_dh_auto_configure:
+ dh_auto_configure -- -Dtest_data_dir=none
+
+# Increase test timeout
+override_dh_auto_test:
+ dh_auto_test -- -t 3
--- /dev/null
+3.0 (native)
--- /dev/null
+
+Libzim 7 transition guide
+=========================
+
+
+Libzim7 changes a lot of things in the API and in the way we use namespaces (reflected in the API changes).
+This document helps to do the transition from libzim6 to libzim7.
+
+Namespace handling
+------------------
+
+In libzim6 namespaces were exposed to the user. It was up to the user to handle them correctly.
+Libzim6 was not making any assumption about the namespaces.
+However, the usage (mainly from libkiwix) was to store metadata in ``M`` namespace, articles in ``A`` and image/video in ``I``.
+
+On the other hand, libzim7 hides the concept of namespace and handles it for the user.
+While namespaces are still present and used in the zim format, they have vanished from the libzim api.
+For information (but it is not needed to use libzim), we now store all "user content" in the ``C`` namespace.
+Metadata are stored in the ``M`` namespace and we use a few others (``X``, ``W``) for some internal content.
+
+"User content" is accessed using the "classic" methods to get content.
+Metadata, illustrations and such are accessed using specific methods.
+
+An article stored in ``A`` namespace before ("A/index.html") is now accessed simply using "index.html".
+(It is stored in "C/index.html" in new format, but you must not specify the namespace in the new api).
+
+Compatibility
+-------------
+
+libzim6 is agnostic about the namespaces. They are exposed to the user, whether we are
+reading a new or an old zim file. It is up to the user to correctly handle namespaces
+(mainly, content is now in ``C`` instead of ``A``/``I``).
+
+libzim7 tries to be smart about the transition. It will look in the right namespace, depending
+on the zim file.
+Accessing "index.html" should work whether the file uses the old or the new namespace scheme.
+
+Accessing article/entry
+-----------------------
+
+Getting one entry
+.................
+
+
+In libzim6 accessing an ``Article`` was done using a ``File`` instance.
+You then had to check for the `Article` validity before using it.
+
+ .. code-block:: c++
+
+ auto f = zim::File("foo.zim");
+ auto a = f.getArticleByUrl("A/index.html");
+ if (!a.good()) {
+ std::cerr << "No article \"A/index.html\"" << std::endl;
+ }
+
+In libzim7 you access an |Entry| using an |Archive| instance.
+If the entry is not found, an exception is raised.
+
+ .. code-block:: c++
+
+ auto a = zim::Archive("foo.zim");
+ try {
+ auto e = a.getEntryByPath("index.html");
+ } catch (zim::EntryNotFound& e) {
+ std::cerr << "No entry \"index.html\"" << std::endl;
+ }
+
+
+Redirection
+...........
+
+
+An article in libzim6 may be a redirection to another article or an article containing data.
+You had to check the kind of the article before using the right set of methods.
+Using a method on a wrong kind was undefined behavior.
+
+ .. code-block:: c++
+
+ auto article = [...];
+ if (article.isRedirect()) {
+ auto target = article.getRedirectArticle();
+ } else {
+ auto blob = article.getData();
+ }
+
+
+In libzim7, |Entry| is a kind of intermediate structure, either redirecting to another entry or pointing to an item.
+An |Item| is the structure containing the data.
+
+ .. code-block:: c++
+
+ auto entry = [...];
+ if (entry.isRedirect()) {
+ auto target = entry.getRedirectEntry();
+ } else {
+ auto item = entry.getItem();
+ auto blob = item.getData();
+ }
+
+
+As a common usage is to get the item associated to the entry while resolving the redirection chain,
+it is possible to do this easily :
+
+.. code-block:: c++
+
+ auto entry = [...];
+ // Resolve any redirection chain and return the final item.
+ auto item = entry.getItem(true);
+ auto blob = item.getData();
+
+Iteration
+.........
+
+To iterate on articles with libzim6 you had to use the ``begin*`` methods to get an iterator.
+You could iterate until ``end()`` was reached.
+
+ .. code-block:: c++
+
+ auto file = [...];
+ for(auto it = file.beginByUrl(); it!=file.end(); it++) {
+ auto article = *it;
+ [...]
+ }
+
+
+If you wanted to iterate on articles starting with a url prefix it was a bit more complex :
+
+ .. code-block:: c++
+
+ auto file = [...];
+ auto it = file.find("A/ind");
+ while(!it.is_end() && it->getUrl().startWith("A/ind")) {
+ auto article = *it;
+ [...]
+ it++;
+ }
+
+
+In libzim7 you get an |EntryRange| on which you can easily iterate:
+
+ .. code-block:: c++
+
+ auto archive = [...];
+ for(auto entry : archive.iterByPath()) {
+ [...]
+ }
+
+ .. code-block:: c++
+
+ auto archive = [...];
+ for(auto entry : archive.findByPath("ind")) {
+ [...]
+ }
+
+Searching
+---------
+
+In libzim6 searching was done using a single class, ``Search``:
+
+ .. code-block:: c++
+
+ auto f = zim::File("foo.zim");
+ auto search = zim::Search(&f);
+ search.set_query("bar");
+ search.set_range(10, 30);
+ for (auto it =search.begin(); it!=search.end(); it++)
+ {
+ std::cout << "Found result " << it.get_url() << std::endl;
+ }
+
+In libzim7 you search starting from a |Searcher|.
+
+ .. code-block:: c++
+
+ // Create a searcher, something to search on an archive
+ zim::Searcher searcher(archive);
+
+ // We need a query to specify what to search for
+ zim::Query query;
+ query.setQuery("bar");
+
+ // Create a search for the specified query
+ zim::Search search = searcher.search(query);
+
+ // Now we can get some result from the search.
+ // 20 results starting from offset 10 (from 10 to 30)
+ zim::SearchResultSet results = search.getResults(10, 20);
+
+ // SearchResultSet is iterable
+ for(auto entry: results) {
+ std::cout << entry.getPath() << std::endl;
+ }
+
+While it may seem a bit more complex (and it is), it has the main advantage of allowing
+the different instances to be reused :
+
+- |Searcher| is what we are searching on, we can do several searches on it without recreating an internal xapian database.
+- |Query| is what we are searching for.
+- |Search| is a specific search.
+- |SearchResultSet| is a set of results for a |Search|, it allows getting particular results without having to search several times.
+
+Suggestion
+----------
+
+In libzim6 suggestion was made using the same class ``Search`` but by setting the suggestion mode before
+iterating on the results.
+
+ .. code-block:: c++
+
+ auto f = zim::File("foo.zim");
+ auto search = zim::Search(&f);
+ search.set_query("bar");
+ search.set_range(10, 30);
+ search.set_suggestion_mode(true); // <<<
+ for (auto it =search.begin(); it!=search.end(); it++)
+ {
+ std::cout << "Found result " << it.get_url() << std::endl;
+ }
+
+If the zim file had no suggestion database, the suggestion search was made on full text database
+(with variable results).
+
+In libzim7 you do suggestion using |SuggestionSearcher| API :
+
+ .. code-block:: c++
+
+ // Create a searcher, something to search on an archive
+ zim::SuggestionSearcher searcher(archive);
+
+ // Create a search for the specified query
+ zim::SuggestionSearch search = searcher.search("bar");
+
+ // Now we can get some result from the search.
+ // 20 results starting from offset 10 (from 10 to 30)
+ zim::SuggestionResultSet results = search.getResults(10, 20);
+
+ // SuggestionResultSet is iterable
+ for(auto entry: results) {
+ std::cout << entry.getPath() << std::endl;
+ }
+
+
+Creating a zim file
+-------------------
+
+Creating a zim file with libzim6 was pretty complex.
+One had to inherit the ``zim::writer::Creator`` to provide the main url.
+Then one had to inherit from ``zim::writer::Article`` to be able to add different kinds of articles to the zim file.
+
+ .. code-block:: c++
+
+ class MyCreator: public zim::writer::Creator {
+ Url getMainUrl() const { return Url('A', "index.html"); }
+ };
+
+ class RedirectArticle : public zim::writer::Article {
+ public:
+ RedirectArticle(const std::string& title, const std::string& url, const std::string& target)
+ : title(title),
+ url(url),
+ target(target)
+ {}
+
+ bool isRedirect() const { return true; }
+ zim::writer::Url getUrl() const { return url; }
+ std::string getTitle() const { return title; }
+ zim::writer::Url getRedirectUrl() const { return target; }
+
+ private:
+ std::string title;
+ std::string url;
+ std::string target;
+ };
+
+ class ContentArticle: public zim::writer::Article {
+ ContentArticle(const std::string& title, const std::string& url, const std::string& mimetype, const std::string& content)
+ : title(title),
+ url(url),
+ mimetype(mimetype),
+ content(content)
+ {}
+
+ bool isRedirect() const { return false; }
+ zim::writer::Url getUrl() const { return url; }
+ std::string getTitle() const { return title; }
+ std::string getMimeType() const { return mimetype; }
+ Blob getData() const { return Blob(content.data(), content.size()); }
+ private:
+ std::string title;
+ std::string url;
+ std::string mimetype;
+ std::string content;
+ };
+
+ int main() {
+ MyCreator creator;
+ creator.startZimCreation("out_file.zim");
+ std::shared_ptr<zim::writer::Article> article = std::make_shared<ContentArticle>("A article", "A/article", "text/html", "A content");
+ creator.addArticle(article);
+ std::shared_ptr<zim::writer::Article> redirect = std::make_shared<RedirectArticle>("A redirect", "A/redirect", "A/article");
+ creator.addArticle(redirect);
+ creator.finishZimCreation();
+ }
+
+On libzim7, you don't have to inherit the |Creator|.
+Redirect and metadata are added using |addRedirection| and |addMetadata|.
+You still may have to inherit |WriterItem| but default implementation
+are provided (|StringItem|, |FileItem|).
+
+ .. code-block:: c++
+
+ int main() {
+ zim::writer::Creator creator;
+ creator.startZimCreation("out_file.zim");
+ creator.addRedirection("A/redirect", "A redirect", "A/article");
+ std::shared_ptr<zim::writer::Item> item = std::make_shared<StringItem>("article", "text/html", "A article", {}, "A content");
+ creator.addItem(item);
+ creator.finishZimCreation();
+ }
+
+Metadata and Illustration
+.........................
+
+Metadata are added using |addMetadata|.
+You don't have to create a specific item in ``M`` namespace.
+
+The creator now creates the ``M/Counter`` metadata for you. You don't have to (and must not) add a ``M/Counter`` yourself.
+
+Favicon has been deprecated in favor of Illustration.
+In libzim6, you had to add a file in ``I`` namespace and add a ``-/favicon`` redirection to the file.
+In libzim7, you have to use the |addIllustration| method.
+
+
+Hints
+.....
+
+Hints are a new concept in libzim7.
+This is a generic way to pass information to the creator about how to handle item/redirection.
+
+An almost mandatory hint to pass is the hint ``FRONT_ARTICLE`` (|HintKeys|).
+``FRONT_ARTICLE`` marks an entry (item or redirection) as a main article for the reader
+(typically an HTML page, as opposed to a resource file such as CSS, JS, ...).
+The random and suggestion features will search only in entries marked as ``FRONT_ARTICLE``.
+If no entry is marked as ``FRONT_ARTICLE``, all entries will be used.
+
+ .. Declare some replacement helpers
+
+ .. |Archive| replace:: :class:`zim::Archive`
+ .. |EntryRange| replace:: :class:`zim::Archive::EntryRange`
+ .. |Entry| replace:: :class:`zim::Entry`
+ .. |Item| replace:: :class:`zim::Item`
+ .. |EntryNotFound| replace:: :class:`zim::EntryNotFound`
+ .. |Searcher| replace:: :class:`zim::Searcher`
+ .. |Search| replace:: :class:`zim::Search`
+ .. |Query| replace:: :class:`zim::Query`
+ .. |SearchResultSet| replace:: :class:`zim::SearchResultSet`
+ .. |SuggestionSearcher| replace:: :class:`zim::SuggestionSearcher`
+ .. |getEntryByPath| replace:: :func:`getEntryByPath<void zim::Archive::getEntryByPath(const std::string&) const>`
+ .. |getEntryByTitle| replace:: :func:`getEntryByTitle<void zim::Archive::getEntryByTitle(const std::string&) const>`
+ .. |findByPath| replace:: :func:`findByPath<zim::Archive::findByPath>`
+ .. |findByTitle| replace:: :func:`findByTitle<zim::Archive::findByTitle>`
+ .. |Creator| replace:: :class:`zim::writer::Creator`
+ .. |WriterItem| replace:: :class:`zim::writer::Item`
+ .. |StringItem| replace:: :class:`zim::writer::StringItem`
+ .. |FileItem| replace:: :class:`zim::writer::FileItem`
+ .. |addMetadata| replace:: :func:`addMetadata<zim::writer::Creator::addMetadata>`
+ .. |addRedirection| replace:: :func:`addRedirection<zim::writer::Creator::addRedirection>`
+ .. |addIllustration| replace:: :func:`addIllustration<zim::writer::Creator::addIllustration>`
+ .. |HintKeys| replace:: :enum:`zim::writer::HintKeys`
--- /dev/null
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = 'libzim'
+copyright = '2020, libzim-team'
+author = 'libzim-team'
+
+
+# -- General configuration ---------------------------------------------------
+
+# True only when the READTHEDOCS environment variable is exactly the string 'True'.
+on_rtd = os.environ.get('READTHEDOCS', None) == 'True'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'breathe',
+ 'exhale'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+
+# The theme is only set explicitly when on_rtd is False; when on_rtd is True
+# html_theme is left undefined by this file.
+if not on_rtd:
+ html_theme = 'sphinx_rtd_theme'
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Settings for the 'breathe' extension: one project named "libzim" mapped to
+# the ./xml directory (presumably the Doxygen XML output -- confirm), used as
+# the default project.
+breathe_projects = {
+ "libzim": "./xml"
+}
+breathe_default_project = 'libzim'
+
+# Settings for the 'exhale' extension. The root page is ref_api.rst inside
+# ./api, which matches the `api/ref_api` entry of the index toctree.
+# NOTE(review): the exact meaning of each key is defined by exhale; the
+# Doxygen input appears to be the ../include directory -- confirm.
+exhale_args = {
+ "containmentFolder": "./api",
+ "rootFileName": "ref_api.rst",
+ "rootFileTitle": "Reference API",
+ "doxygenStripFromPath":"..",
+ "treeViewIsBootstrap": True,
+ "createTreeView" : True,
+ "exhaleExecutesDoxygen": True,
+ "exhaleDoxygenStdin": "INPUT = ../include"
+}
+
+# C++ is both the default Sphinx domain and the default highlighting language.
+primary_domain = 'cpp'
+
+highlight_language = 'cpp'
--- /dev/null
+.. libzim documentation master file, created by
+ sphinx-quickstart on Fri Jul 24 15:40:50 2020.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to libzim's documentation!
+==================================
+
+.. toctree::
+ :maxdepth: 2
+ :caption: Contents:
+
+ usage
+ 6to7
+ api/ref_api
--- /dev/null
+
+# Look up the sphinx-build program (native: true, i.e. a program for the build machine).
+sphinx = find_program('sphinx-build', native:true)
+
+# 'doc' run target: calls sphinx-build with the html builder, using this
+# source directory as input and the matching build directory as output.
+sphinx_target = run_target('doc',
+ command: [sphinx, '-bhtml',
+ meson.current_source_dir(),
+ meson.current_build_dir()])
--- /dev/null
+breathe
+exhale
+sphinx<4
--- /dev/null
+Libzim programming
+==================
+
+Introduction
+------------
+
+libzim is written in C++. To use the library, you need the include files of libzim and you have
+to link against libzim.
+
+Errors are handled with exceptions. When something goes wrong, libzim throws an error,
+which is always derived from std::exception.
+
+All classes are defined in the namespace zim.
+Copying is allowed and is designed to be as cheap as possible.
+The reading part of libzim is thread safe most of the time.
+The searching and creating parts are not. You have to serialize access to these classes yourself.
+
+The main class, which accesses an archive, is |Archive|.
+It actually holds a reference to an implementation, so that copies of the class just reference the same file.
+You open a file by passing the file name to the constructor as a std::string.
+
+Iterating over entries is done by creating an |EntryRange|.
+
+.. code-block:: c++
+
+ #include <zim/archive.h>
+ #include <iostream>
+ int main(int argc, char* argv[])
+ {
+ try
+ {
+ zim::Archive a("wikipedia.zim");
+
+ for (auto entry: a.iterByPath()) {
+ std::cout << "path: " << entry.getPath() << " title: " << entry.getTitle() << std::endl;
+ }
+ } catch (const std::exception& e) {
+ std::cerr << e.what() << std::endl;
+ }
+ }
+
+In subsequent examples, only code needed to use the library will be explained.
+The main-function with the error catcher should always be in place.
+
+Getting entries
+---------------
+
+Entries are addressed either by path or title.
+
+|Archive| has methods |getEntryByPath| and |getEntryByTitle|. Both take one parameter: a string, which specifies the path or the title of the entry to get.
+They return an |Entry|.
+If the entry cannot be found, they throw the exception |EntryNotFound|.
+
+An entry is an entry point in an archive for "things". It can be a redirection to another entry or an |Item|.
+
+ .. code-block:: c++
+
+ auto entry = archive.getEntryByPath("foo");
+ if (entry.isRedirect()) {
+ std::cout << "This is a redirection to " << entry.getRedirectEntry().getPath() << std::endl;
+ } else {
+ std::cout << "This is an item with content : " << entry.getItem().getData() << std::endl;
+ }
+
+As it is pretty common to resolve potential entry redirection and get the final item, you can do it directly using `getItem` :
+
+ .. code-block:: c++
+
+ auto entry = archive.getEntryByPath("foo");
+ auto item = entry.getItem(true);
+ if (entry.isRedirect()) {
+ std::cout << "Entry " << entry.getPath() << " is an entry pointing to the item " << item.getPath() << std::endl;
+ } else {
+ std::cout << entry.getPath() << " should be equal to " << item.getPath() << std::endl;
+ }
+ std::cout << "The item data is " << item.getData() << std::endl;
+
+Finding entries
+---------------
+
+|getEntryByPath|/|getEntryByTitle| allow you to get an exact entry.
+But you may want to find entries using a looser method.
+|findByPath| and |findByTitle| allow you to find entries starting with the given path/title prefix.
+
+|findByPath|/|findByTitle| return an |EntryRange| you can iterate on :
+
+ .. code-block:: c++
+
+ for (auto entry: archive.findByPath("fo")) {
+ std::cout << "Entry " << entry.getPath() << " should start with fo." << std::endl;
+ }
+
+Searching for entries
+---------------------
+
+Finding entries by path/title is nice but you may want to search for entries based on their content.
+If the zim archive contains a full text index, you can search on it.
+
+The class |Searcher| allows you to search on one or several |Archive|.
+It allows you to create a |Search| which represents a particular search for a |Query|.
+From a |Search|, you can get a |SearchResultSet| on which you can iterate.
+
+ .. code-block:: c++
+
+ // Create a searcher, something to search on an archive
+ zim::Searcher searcher(archive);
+
+ // We need a query to specify what to search for
+ zim::Query query;
+ query.setQuery("bar");
+
+ // Create a search for the specified query
+ zim::Search search = searcher.search(query);
+
+ // Now we can get some result from the search.
+ // 20 results starting from offset 10 (from 10 to 30)
+ zim::SearchResultSet results = search.getResults(10, 20);
+
+ // SearchResultSet is iterable
+ for(auto entry: results) {
+ std::cout << entry.getPath() << std::endl;
+ }
+
+Searching for suggestions
+-------------------------
+
+While |findByTitle| may be a good start to search for suggestions, you may want to search for suggestions matching a term
+in the middle of the suggestion.
+
+The suggestion API allows you to search for suggestions, using the suggestion database included in recent zim files.
+The suggestion API is pretty close to the search API:
+
+ .. code-block:: c++
+
+ // Create a searcher, something to search on an archive
+ zim::SuggestionSearcher searcher(archive);
+
+ // Create a search for the specified query
+ zim::SuggestionSearch search = searcher.search("bar");
+
+ // Now we can get some result from the search.
+ // 20 results starting from offset 10 (from 10 to 30)
+ zim::SuggestionResultSet results = search.getResults(10, 20);
+
+ // SuggestionResultSet is iterable
+ for(auto entry: results) {
+ std::cout << entry.getPath() << std::endl;
+ }
+
+If the zim file doesn't contain a suggestion database, the suggestion search will fall back to |findByTitle| for you.
+
+ .. Declare some replacement helpers
+
+ .. |Archive| replace:: :class:`zim::Archive`
+ .. |EntryRange| replace:: :class:`zim::Archive::EntryRange`
+ .. |Entry| replace:: :class:`zim::Entry`
+ .. |Item| replace:: :class:`zim::Item`
+ .. |EntryNotFound| replace:: :class:`zim::EntryNotFound`
+ .. |Searcher| replace:: :class:`zim::Searcher`
+ .. |Search| replace:: :class:`zim::Search`
+ .. |Query| replace:: :class:`zim::Query`
+ .. |SearchResultSet| replace:: :class:`zim::SearchResultSet`
+ .. |getEntryByPath| replace:: :func:`getEntryByPath<void zim::Archive::getEntryByPath(const std::string&) const>`
+ .. |getEntryByTitle| replace:: :func:`getEntryByTitle<void zim::Archive::getEntryByTitle(const std::string&) const>`
+ .. |findByPath| replace:: :func:`findByPath<zim::Archive::findByPath>`
+ .. |findByTitle| replace:: :func:`findByTitle<zim::Archive::findByTitle>`
+
--- /dev/null
+/*
+ * Copyright (C) 2012 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+
+#include <zim/writer/contentProvider.h>
+#include <zim/writer/creator.h>
+#include <zim/blob.h>
+
+/**
+ * Minimal zim::writer::Item implementation used by this example.
+ *
+ * An item is built from a string id; its path ("A/" + id), its title (the id)
+ * and its text content are all derived from that id.
+ */
+class TestItem : public zim::writer::Item
+{
+ std::string _id; // id given to the constructor
+ std::string _data; // text content, built once by the constructor
+
+ public:
+ TestItem() { }
+ explicit TestItem(const std::string& id);
+ virtual ~TestItem() = default;
+
+ virtual std::string getPath() const;
+ virtual std::string getTitle() const;
+ virtual std::string getMimeType() const;
+
+ // Returns a provider serving the content stored in _data.
+ virtual std::unique_ptr<zim::writer::ContentProvider> getContentProvider() const;
+};
+
+// Build the item for `id`; its content is the line "this is item <id>" followed by a newline.
+TestItem::TestItem(const std::string& id)
+ : _id(id),
+ _data("this is item " + id + "\n")
+{
+}
+
+// Path of the item: the id prefixed with "A/".
+std::string TestItem::getPath() const
+{
+ std::string path("A/");
+ path += _id;
+ return path;
+}
+
+// Title of the item: the id itself.
+std::string TestItem::getTitle() const
+{
+ return _id;
+}
+
+// Every test item is declared as plain text.
+std::string TestItem::getMimeType() const
+{
+ return "text/plain";
+}
+
+// Hand out a freshly allocated StringProvider built from the item's text content.
+std::unique_ptr<zim::writer::ContentProvider> TestItem::getContentProvider() const
+{
+ std::unique_ptr<zim::writer::ContentProvider> provider(new zim::writer::StringProvider(_data));
+ return provider;
+}
+
+/**
+ * Create "foo.zim" containing 16 TestItem entries with ids "1" to "16"
+ * (paths "A/1" to "A/16") and use the first one as main entry.
+ *
+ * @return 0 on success, 1 if an exception was reported while creating the file.
+ */
+int main(int argc, char* argv[])
+{
+ unsigned max = 16;
+ try {
+ zim::writer::Creator c;
+ c.configVerbose(false).configCompression(zim::Compression::Zstd);
+ c.startZimCreation("foo.zim");
+ for (unsigned n = 0; n < max; ++n)
+ {
+ // Ids are 1-based: the first item is "1", the last one is "16".
+ std::ostringstream id;
+ id << (n + 1);
+ auto article = std::make_shared<TestItem>(id.str());
+ c.addItem(article);
+ }
+ // The main path must name an item that was actually added.
+ // Ids start at 1, so "A/0" never exists; point at the first item instead.
+ c.setMainPath("A/1");
+ c.finishZimCreation();
+ }
+ catch (const std::exception& e)
+ {
+ std::cerr << e.what() << std::endl;
+ // Report the failure to the caller instead of exiting with status 0.
+ return 1;
+ }
+ return 0;
+}
+
--- /dev/null
+
+# Build the createZimExample program from createZimExample.cpp, linked against libzim.
+executable('createZimExample', 'createZimExample.cpp',
+ link_with: libzim,
+ link_args: extra_link_args,
+ include_directories: include_directory,
+ dependencies: [thread_dep, xapian_dep, icu_dep, lzma_dep, zstd_dep])
--- /dev/null
+# Process the meson.build of the zim/ subdirectory.
+subdir('zim')
+
+# Expose this directory as an include path under the name `include_directory`.
+include_directory = include_directories('.')
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ARCHIVE_H
+#define ZIM_ARCHIVE_H
+
+#include "zim.h"
+#include "entry.h"
+#include "uuid.h"
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <bitset>
+#include <set>
+
+namespace zim
+{
+ class FileImpl;
+
+ /** The orders in which the entries of an archive can be indexed and iterated. */
+ enum class EntryOrder {
+ pathOrder, ///< Entries sorted by path.
+ titleOrder, ///< Entries sorted by title.
+ efficientOrder ///< Order inferred from the way the entries are stored; not otherwise specified.
+ };
+
+ /**
+ * The Archive class to access content in a zim file.
+ *
+ * The `Archive` is the main class to access content in a zim file.
+ * `Archive` objects are lightweight and can be copied easily.
+ *
+ * All methods of archive may throw a `ZimFileFormatError` if the file is invalid.
+ */
+ class Archive
+ {
+ public:
+ template<EntryOrder order> class EntryRange;
+ template<EntryOrder order> class iterator;
+
+ /** Archive constructor.
+ *
+ * Construct an archive from a filename.
+ * The file is open readonly.
+ *
+ * The filename is the "logical" path.
+ * So if you want to open a split zim file (foo.zimaa, foo.zimab, ...)
+ * you must pass the `foo.zim` path.
+ *
+ * @param fname The filename to the file to open (utf8 encoded)
+ */
+ explicit Archive(const std::string& fname);
+
+#ifndef _WIN32
+ /** Archive constructor.
+ *
+ * Construct an archive from a file descriptor.
+ *
+ * Note: This function is not available under Windows.
+ *
+ * @param fd The descriptor of a seekable file representing a ZIM archive
+ */
+ explicit Archive(int fd);
+
+ /** Archive constructor.
+ *
+ * Construct an archive from a descriptor of a file with an embedded ZIM
+ * archive inside.
+ *
+ * Note: This function is not available under Windows.
+ *
+ * @param fd The descriptor of a seekable file with a continuous segment
+ * representing a complete ZIM archive.
+ * @param offset The offset of the ZIM archive relative to the beginning
+ * of the file (rather than the current position associated with fd).
+ * @param size The size of the ZIM archive.
+ */
+ Archive(int fd, offset_type offset, size_type size);
+#endif
+
+ /** Return the filename of the zim file.
+ *
+ * Return the filename as passed to the constructor
+ * (So foo.zim).
+ *
+ * @return The logical filename of the archive.
+ */
+ const std::string& getFilename() const;
+
+ /** Return the logical archive size.
+ *
+ * Return the size of the full archive, not the size of the file on the fs.
+ * If the zim is split, return the sum of the size of the parts.
+ *
+ * @return The logical size of the archive.
+ */
+ size_type getFilesize() const;
+
+ /** Return the number of entries in the archive.
+ *
+ * Return the total number of entries in the archive, including
+ * internal entries created by libzim itself, metadata, indexes, ...
+ *
+ * @return the number of all entries in the archive.
+ */
+ entry_index_type getAllEntryCount() const;
+
+ /** Return the number of user entries in the archive.
+ *
+ * If the notion of "user entries" doesn't exist in the zim archive,
+ * returns `getAllEntryCount()`.
+ *
+ * @return the number of user entries in the archive.
+ */
+ entry_index_type getEntryCount() const;
+
+ /** Return the number of articles in the archive.
+ *
+ * The definition of "article" depends on the zim archive.
+ * On recent archives, this corresponds to all entries marked as "FRONT_ARTICLE"
+ * at creation time.
+ * On old archives, this corresponds to all entries in 'A' namespace.
+ * A few archives may have been created without namespace but also without specific
+ * article listing. In this case, articles are all user entries.
+ *
+ * @return the number of articles in the archive.
+ */
+ entry_index_type getArticleCount() const;
+
+ /** The uuid of the archive.
+ *
+ * @return the uuid of the archive.
+ */
+ Uuid getUuid() const;
+
+ /** Get a specific metadata content.
+ *
+ * Get the content of a metadata stored in the archive.
+ *
+ * @param name The name of the metadata.
+ * @return The content of the metadata.
+ * @exception EntryNotFound If the metadata is not in the archive.
+ */
+ std::string getMetadata(const std::string& name) const;
+
+ /** Get a specific metadata item.
+ *
+ * Get the item associated to a metadata stored in the archive.
+ *
+ * @param name The name of the metadata.
+ * @return The item associated to the metadata.
+ * @exception EntryNotFound If the metadata is not in the archive.
+ */
+ Item getMetadataItem(const std::string& name) const;
+
+ /** Get the list of metadata stored in the archive.
+ *
+ * @return The list of metadata in the archive.
+ */
+ std::vector<std::string> getMetadataKeys() const;
+
+ /** Get the illustration item of the archive.
+ *
+ * Illustration is an icon for the archive that can be used in catalog and elsewhere to illustrate the archive.
+ *
+ * @param size The size (width and height) of the illustration to get. Default to 48 (48x48px icon)
+ * @return The illustration item.
+ * @exception EntryNotFound If no illustration item can be found.
+ */
+ Item getIllustrationItem(unsigned int size=48) const;
+
+ /** Return a list of available sizes (width) for the illustrations in the archive.
+ *
+ * Illustration is an icon for the archive that can be used in catalog and elsewhere to illustrate the archive.
+ * An Archive may contain several illustrations with different sizes.
+ * This method allows to know which illustrations are in the archive (by size: width)
+ *
+ * @return A set of size.
+ */
+ std::set<unsigned int> getIllustrationSizes() const;
+
+
+ /** Get an entry using its "path" index.
+ *
+ * Use the index of the entry to get the idx'th entry
+ * (entry being sorted by path).
+ *
+ * @param idx The index of the entry.
+ * @return The Entry.
+ * @exception std::out_of_range If idx is greater than the number of entry.
+ */
+ Entry getEntryByPath(entry_index_type idx) const;
+
+ /** Get an entry using a path.
+ *
+ * Get an entry using its path.
+ * The path must contain the namespace.
+ *
+ * @param path The entry's path.
+ * @return The Entry.
+ * @exception EntryNotFound If no entry has the asked path.
+ */
+ Entry getEntryByPath(const std::string& path) const;
+
+ /** Get an entry using its "title" index.
+ *
+ * Use the index of the entry to get the idx'th entry
+ * (entry being sorted by title).
+ *
+ * @param idx The index of the entry.
+ * @return The Entry.
+ * @exception std::out_of_range If idx is greater than the number of entry.
+ */
+ Entry getEntryByTitle(entry_index_type idx) const;
+
+ /** Get an entry using a title.
+ *
+ * Get an entry using its title.
+ *
+ * @param title The entry's title.
+ * @return The Entry.
+ * @exception EntryNotFound If no entry has the asked title.
+ */
+ Entry getEntryByTitle(const std::string& title) const;
+
+ /** Get an entry using its "cluster" index.
+ *
+ * Use the index of the entry to get the idx'th entry
+ * The actual order of the entries is not really specified.
+ * It is inferred from the internal way the entries are stored.
+ *
+ * This method is probably not relevant and is provided for completeness.
+ * You should probably use an iterator using the `efficientOrder`.
+ *
+ * @param idx The index of the entry.
+ * @return The Entry.
+ * @exception std::out_of_range If idx is greater than the number of entry.
+ */
+ Entry getEntryByClusterOrder(entry_index_type idx) const;
+
+ /** Get the main entry of the archive.
+ *
+ * @return The Main entry.
+ * @exception EntryNotFound If no main entry has been specified in the archive.
+ */
+ Entry getMainEntry() const;
+
+ /** Get a random entry.
+ *
+ * The entry is picked randomly from the front article list.
+ *
+ * @return A random entry.
+ * @exception EntryNotFound If no valid random entry can be found.
+ */
+ Entry getRandomEntry() const;
+
+ /** Check if an entry with the given path exists in the archive.
+ *
+ * @param path The entry's path.
+ * @return True if the path is in the archive, false otherwise.
+ */
+ bool hasEntryByPath(const std::string& path) const {
+ // The lookup reports a miss by throwing; any exception is turned into "not found".
+ bool found = true;
+ try {
+ getEntryByPath(path);
+ } catch(...) {
+ found = false;
+ }
+ return found;
+ }
+
+ /** Check if an entry with the given title exists in the archive.
+ *
+ * @param title The entry's title.
+ * @return True if the title is in the archive, false otherwise.
+ */
+ bool hasEntryByTitle(const std::string& title) const {
+ // The lookup reports a miss by throwing; any exception is turned into "not found".
+ bool found = true;
+ try {
+ getEntryByTitle(title);
+ } catch(...) {
+ found = false;
+ }
+ return found;
+ }
+
+ /** Check if archive has a main entry
+ *
+ * @return True if the archive has a main entry.
+ */
+ bool hasMainEntry() const;
+
+ /** Check if archive has an illustration entry
+ *
+ * @param size The size (width and height) of the illustration to check. Default to 48 (48x48px icon)
+ * @return True if the archive has a corresponding illustration entry.
+ * (Always True if the archive has no illustration, but a favicon)
+ */
+ bool hasIllustration(unsigned int size=48) const;
+
+ /** Check if the archive has a fulltext index.
+ *
+ * @return True if the archive has a fulltext index
+ */
+ bool hasFulltextIndex() const;
+
+ /** Check if the archive has a title index.
+ *
+ * @return True if the archive has a title index
+ */
+ bool hasTitleIndex() const;
+
+
+ /** Get an "iterable" by path order.
+ *
+ * This method allows to iterate on all user entries using a path order.
+ * If the notion of "user entries" doesn't exist (for old zim archives),
+ * this iterates on all entries in the zim file.
+ *
+ * ```
+ * for(auto& entry:archive.iterByPath()) {
+ * ...
+ * }
+ * ```
+ *
+ * @return A range on all the entries, in path order.
+ */
+ EntryRange<EntryOrder::pathOrder> iterByPath() const;
+
+ /** Get an "iterable" by title order.
+ *
+ * This method allows to iterate on all articles using a title order.
+ * The definition of "article" depends on the zim archive.
+ * On recent archives, this corresponds to all entries marked as "FRONT_ARTICLE"
+ * at creation time.
+ * On old archives, this corresponds to all entries in 'A' namespace.
+ * A few archives may have been created without namespace but also without specific
+ * article listing. In this case, this iterates on all user entries.
+ *
+ * ```
+ * for(auto& entry:archive.iterByTitle()) {
+ * ...
+ * }
+ * ```
+ *
+ * @return A range on all the entries, in title order.
+ */
+ EntryRange<EntryOrder::titleOrder> iterByTitle() const;
+
+ /** Get an "iterable" by an efficient order.
+ *
+ * This method allows to iterate on all user entries using an efficient order.
+ * If the notion of "user entries" doesn't exist (for old zim archives),
+ * this iterates on all entries in the zim file.
+ *
+ * ```
+ * for(auto& entry:archive.iterEfficient()) {
+ * ...
+ * }
+ * ```
+ *
+ * @return A range on all the entries, in efficient order.
+ */
+ EntryRange<EntryOrder::efficientOrder> iterEfficient() const;
+
+ /** Find a range of entry starting with path.
+ *
+ * The path is the "long path". (Ie, with the namespace)
+ *
+ * @param path The path prefix to search for.
+ * @return A range starting from the first entry starting with path
+ * and ending past the last entry.
+ * If no entry starts with `path`, begin == end.
+ */
+ EntryRange<EntryOrder::pathOrder> findByPath(std::string path) const;
+
+ /** Find a range of entry starting with title.
+ *
+ * The entry title is searched in `A` namespace.
+ *
+ * @param title The title prefix to search for.
+ * @return A range starting from the first entry starting with title
+ * and ending past the last entry.
+ * If no entry starts with `title`, begin == end.
+ */
+ EntryRange<EntryOrder::titleOrder> findByTitle(std::string title) const;
+
+ /** hasChecksum.
+ *
+ * The checksum is not the checksum of the file.
+ * It is an internal checksum stored in the zim file.
+ *
+ * @return True if the archive has a checksum.
+ */
+ bool hasChecksum() const;
+
+ /** getChecksum.
+ *
+ * @return the checksum stored in the archive.
+ * If the archive has no checksum return an empty string.
+ */
+ std::string getChecksum() const;
+
+ /** Check that the zim file is valid (in regard to its checksum).
+ *
+ * If the zim file has no checksum return false.
+ *
+ * @return True if the file is valid.
+ */
+ bool check() const;
+
+ /** Check the integrity of the zim file.
+ *
+ * Run different types of checks to verify the zim file is valid
+ * (in regard to the zim format).
+ * This may be time consuming.
+ *
+ * @param checkType The type of integrity check to run.
+ * @return True if the file is valid.
+ */
+ bool checkIntegrity(IntegrityCheck checkType);
+
+ /** Check if the file is split in the filesystem.
+ *
+ * @return True if the archive is split in different files (foo.zimaa, foo.zimab).
+ */
+ bool isMultiPart() const;
+
+ /** Get if the zim archive uses the new namespace scheme.
+ *
+ * Recent zim files use the new namespace scheme.
+ *
+ * From the user's perspective, it means that :
+ * - On old namespace scheme :
+ * . All entries are accessible, either using `getEntryByPath` with a specific namespace
+ * or simply iterating over the entries (with `iter*` methods).
+ * . Entry's path has namespace included ("A/foo.html")
+ * - On new namespace scheme :
+ * . Only the "user" entries are accessible with `getEntryByPath` and `iter*` methods.
+ * To access metadata, use `getMetadata` method.
+ * . Entry's path does not contain namespace ("foo.html")
+ */
+ bool hasNewNamespaceScheme() const;
+
+ /** Get a shared ptr on the FileImpl
+ *
+ * @internal
+ * @return The shared_ptr
+ */
+ std::shared_ptr<FileImpl> getImpl() const { return m_impl; }
+
+#ifdef ZIM_PRIVATE
+ cluster_index_type getClusterCount() const;
+ offset_type getClusterOffset(cluster_index_type idx) const;
+ entry_index_type getMainEntryIndex() const;
+#endif
+
+ private:
+ std::shared_ptr<FileImpl> m_impl;
+ };
+
+ // Index conversion helper, specialized (and defined elsewhere) for each EntryOrder.
+ // NOTE(review): the name and its use when dereferencing Archive::iterator suggest it maps
+ // an index expressed in `order` to the matching path-order index -- confirm in the implementation.
+ template<EntryOrder order>
+ entry_index_type _toPathOrder(const FileImpl& file, entry_index_type idx);
+
+ template<>
+ entry_index_type _toPathOrder<EntryOrder::pathOrder>(const FileImpl& file, entry_index_type idx);
+ template<>
+ entry_index_type _toPathOrder<EntryOrder::titleOrder>(const FileImpl& file, entry_index_type idx);
+ template<>
+ entry_index_type _toPathOrder<EntryOrder::efficientOrder>(const FileImpl& file, entry_index_type idx);
+
+
+ /**
+ * A range [begin, end) of entry indexes, expressed in the given order.
+ *
+ * The range shares ownership of the FileImpl it was created from and hands out
+ * iterators on the entries it covers.
+ */
+ template<EntryOrder order>
+ class Archive::EntryRange {
+ public:
+ explicit EntryRange(const std::shared_ptr<FileImpl> file, entry_index_type begin, entry_index_type end)
+ : m_file(file),
+ m_begin(begin),
+ m_end(end)
+ {}
+
+ iterator<order> begin() const
+ { return iterator<order>(m_file, entry_index_type(m_begin)); }
+ iterator<order> end() const
+ { return iterator<order>(m_file, entry_index_type(m_end)); }
+
+ /** Number of entries covered by the range. */
+ int size() const
+ { return m_end - m_begin; }
+
+ /** Get a sub-range of this range.
+ *
+ * @param start The number of entries to skip from the beginning of the range.
+ * Values below 0 are treated as 0, values past the end give an empty range.
+ * @param maxResults The maximum number of entries in the returned range.
+ * Values below 0 are treated as 0.
+ * @return The sub-range, always contained in this range.
+ */
+ EntryRange<order> offset(int start, int maxResults) const
+ {
+ // All clamping is done on unsigned distances inside the range, so that neither a
+ // negative argument nor a very large one can wrap around and escape [m_begin, m_end).
+ const entry_index_type available = m_end - m_begin;
+ entry_index_type skipped = 0;
+ if (start > 0) {
+ skipped = static_cast<entry_index_type>(start);
+ if (skipped > available) {
+ skipped = available;
+ }
+ }
+ const entry_index_type begin = m_begin + skipped;
+
+ const entry_index_type remaining = m_end - begin;
+ entry_index_type count = 0;
+ if (maxResults > 0) {
+ count = static_cast<entry_index_type>(maxResults);
+ if (count > remaining) {
+ count = remaining;
+ }
+ }
+ return EntryRange<order>(m_file, begin, begin + count);
+ }
+
+private:
+ std::shared_ptr<FileImpl> m_file;
+ entry_index_type m_begin;
+ entry_index_type m_end;
+ };
+
+ /**
+ * Bidirectional iterator on the entries of an archive, in the given order.
+ *
+ * The iterator only stores an index. The Entry is built the first time the iterator is
+ * dereferenced and is then cached until the iterator is moved or assigned.
+ */
+ template<EntryOrder order>
+ class Archive::iterator : public std::iterator<std::bidirectional_iterator_tag, Entry>
+ {
+ public:
+ explicit iterator(const std::shared_ptr<FileImpl> file, entry_index_type idx)
+ : m_file(file),
+ m_idx(idx),
+ m_entry(nullptr)
+ {}
+
+ // The cached entry is owned through a unique_ptr, so a copy gets its own Entry object.
+ iterator(const iterator<order>& other)
+ : m_file(other.m_file),
+ m_idx(other.m_idx),
+ m_entry(other.m_entry?new Entry(*other.m_entry):nullptr)
+ {}
+
+ // Two iterators are equal when they point to the same index of the same FileImpl;
+ // the cached entry does not take part in the comparison.
+ bool operator== (const iterator<order>& it) const
+ { return m_file == it.m_file && m_idx == it.m_idx; }
+ bool operator!= (const iterator<order>& it) const
+ { return !operator==(it); }
+
+ iterator<order>& operator=(iterator<order>&& it) = default;
+
+ // Takes a const reference so that const iterators can be assigned from.
+ // The cached entry is dropped instead of copied; it is rebuilt on the next dereference.
+ iterator<order>& operator=(const iterator<order>& it)
+ {
+ m_entry.reset();
+ m_idx = it.m_idx;
+ m_file = it.m_file;
+ return *this;
+ }
+
+ iterator<order>& operator++()
+ {
+ ++m_idx;
+ m_entry.reset();
+ return *this;
+ }
+
+ iterator<order> operator++(int)
+ {
+ auto it = *this;
+ operator++();
+ return it;
+ }
+
+ iterator<order>& operator--()
+ {
+ --m_idx;
+ m_entry.reset();
+ return *this;
+ }
+
+ iterator<order> operator--(int)
+ {
+ auto it = *this;
+ operator--();
+ return it;
+ }
+
+ // Builds the Entry on first use; m_entry is mutable so the cache can be filled from a const method.
+ const Entry& operator*() const
+ {
+ if (!m_entry) {
+ m_entry.reset(new Entry(m_file, _toPathOrder<order>(*m_file, m_idx)));
+ }
+ return *m_entry;
+ }
+
+ const Entry* operator->() const
+ {
+ operator*();
+ return m_entry.get();
+ }
+
+ private:
+ std::shared_ptr<FileImpl> m_file;
+ entry_index_type m_idx;
+ mutable std::unique_ptr<Entry> m_entry;
+ };
+
+ typedef std::bitset<size_t(IntegrityCheck::COUNT)> IntegrityCheckList;
+ bool validate(const std::string& zimPath, IntegrityCheckList checksToRun);
+}
+
+#endif // ZIM_ARCHIVE_H
+
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BLOB_H
+#define ZIM_BLOB_H
+
+#include "zim.h"
+
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <memory>
+
+namespace zim
+{
+ /**
+ * A Blob pairs a pointer to character data with the size of that data.
+ *
+ * The data pointer is held as a `std::shared_ptr<const char>`.
+ * The constructors are defined outside of this header.
+ */
+ class Blob
+ {
+ public: // types
+ using DataPtr = std::shared_ptr<const char>;
+
+ public: // functions
+ Blob();
+ Blob(const char* data, size_type size);
+ Blob(const DataPtr& buffer, size_type size);
+
+ /** Copy the `size()` bytes starting at `data()` into a new string. */
+ operator std::string() const { return std::string(_data.get(), _size); }
+ /** Pointer to the first byte of the data. */
+ const char* data() const { return _data.get(); }
+ /** Pointer one past the last byte of the data (`data() + size()`). */
+ const char* end() const { return _data.get() + _size; }
+ /** Size of the data, in bytes. */
+ size_type size() const { return _size; }
+
+ private:
+ DataPtr _data;
+ size_type _size;
+ };
+
+ /** Write the raw bytes of `blob` to `out`; a blob without data writes nothing. */
+ inline std::ostream& operator<< (std::ostream& out, const Blob& blob)
+ {
+   const char* const bytes = blob.data();
+   if (!bytes) {
+     return out;
+   }
+   out.write(bytes, blob.size());
+   return out;
+ }
+
+ /** Two blobs are equal when they have the same size and the same bytes. */
+ inline bool operator== (const Blob& b1, const Blob& b2)
+ {
+   if (b1.size() != b2.size()) {
+     return false;
+   }
+   const char* const first = b1.data();
+   return std::equal(first, first + b1.size(), b2.data());
+ }
+}
+
+#endif // ZIM_BLOB_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ENTRY_H
+#define ZIM_ENTRY_H
+
+#include "zim.h"
+
+#include <string>
+#include <memory>
+
+namespace zim
+{
+ class Item;
+ class Dirent;
+ class FileImpl;
+
+ /**
+ * An Entry designates one entry of an archive, identified by its index.
+ *
+ * An entry is either a redirection to another entry or is associated to an Item.
+ */
+ class Entry
+ {
+ public:
+ /** Create the entry of index `idx_` in `file_`. */
+ explicit Entry(std::shared_ptr<FileImpl> file_, entry_index_type idx_);
+
+ bool isRedirect() const;
+ std::string getTitle() const;
+ std::string getPath() const;
+
+ /** Get the item associated to the entry.
+ *
+ * An item is associated only if the entry is not a redirect.
+ * For convenience, if follow is true, return the item associated to the targeted entry.
+ *
+ * @param follow True if the redirection is resolved before getting the item. (false by default)
+ * @return The Item associated to the entry.
+ * @exception InvalidType if the entry is a redirection and follow is false.
+ */
+ Item getItem(bool follow=false) const;
+
+ /** Get the item associated to the target entry.
+ *
+ * If there is a chain of redirections, the whole chain is resolved
+ * and the item associated to the last entry is returned.
+ *
+ * @return the Item associated with the targeted entry.
+ * @exception InvalidType if the entry is not a redirection.
+ */
+ Item getRedirect() const;
+
+ /** Get the Entry targeted by the entry.
+ *
+ * @return The entry directly targeted by this redirect entry.
+ * @exception InvalidEntry if the entry is not a redirection.
+ * NOTE(review): no `InvalidEntry` type is declared in error.h;
+ * presumably `InvalidType` is meant - confirm against the implementation.
+ */
+ Entry getRedirectEntry() const;
+
+ /** The index this entry was created with. */
+ entry_index_type getIndex() const { return m_idx; }
+
+ private:
+ std::shared_ptr<FileImpl> m_file;
+ entry_index_type m_idx;
+ std::shared_ptr<const Dirent> m_dirent;
+ };
+
+}
+
+#endif // ZIM_ENTRY_H
+
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ERROR_H
+#define ZIM_ERROR_H
+
+#include <stdexcept>
+
+namespace zim
+{
+ /**
+ * Runtime error carrying a message.
+ * Presumably raised when a zim file is malformed (the throwing code is not in this header).
+ */
+ class ZimFileFormatError : public std::runtime_error
+ {
+ public:
+ /** @param msg The message returned by `what()`. */
+ explicit ZimFileFormatError(const std::string& msg)
+ : std::runtime_error(msg)
+ { }
+ };
+
+ /**
+ * Logic error carrying a message.
+ * Presumably raised when an operation does not apply to the kind of object
+ * it is called on (the throwing code is not in this header).
+ */
+ class InvalidType: public std::logic_error
+ {
+ public:
+ /** @param msg The message returned by `what()`. */
+ explicit InvalidType(const std::string& msg)
+ : std::logic_error(msg)
+ {}
+ };
+
+ /**
+ * Runtime error carrying a message.
+ * Presumably raised when a requested entry does not exist
+ * (the throwing code is not in this header).
+ */
+ class EntryNotFound : public std::runtime_error
+ {
+ public:
+ /** @param msg The message returned by `what()`. */
+ explicit EntryNotFound(const std::string& msg)
+ : std::runtime_error(msg)
+ {}
+ };
+}
+
+#endif // ZIM_ERROR_H
+
--- /dev/null
+/*
+ * Copyright (C) 2021 Veloman Yunkan
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ITEM_H
+#define ZIM_ITEM_H
+
+#include "zim.h"
+#include "blob.h"
+#include <string>
+
+namespace zim
+{
+ class Dirent;
+ class FileImpl;
+
+ /**
+ * An Item gives access to the content (data) stored for an entry of an archive.
+ */
+ class Item
+ {
+ public: // types
+ /** A (filename, offset) pair, as returned by `getDirectAccessInformation()`. */
+ typedef std::pair<std::string, offset_type> DirectAccessInfo;
+
+ public: // functions
+ /** Create the item of index `idx_` in `file_`. */
+ explicit Item(std::shared_ptr<FileImpl> file_, entry_index_type idx_);
+
+ std::string getTitle() const;
+ std::string getPath() const;
+ std::string getMimetype() const;
+
+ /** Get the data associated to the item
+ *
+ * Get the data of the item, starting at offset.
+ *
+ * @param offset The number of bytes to skip at the beginning of the data.
+ * @return A blob corresponding to the data.
+ */
+ Blob getData(offset_type offset=0) const;
+
+ /** Get the data associated to the item
+ *
+ * Get `size` bytes of data of the item, starting at offset.
+ *
+ * @param offset The number of bytes to skip at the beginning of the data.
+ * @param size The number of bytes to read.
+ * @return A blob corresponding to the data.
+ */
+ Blob getData(offset_type offset, size_type size) const;
+
+ /** The size of the item.
+ *
+ * @return The size (in bytes) of the item.
+ */
+ size_type getSize() const;
+
+ /** Direct access information.
+ *
+ * Some items are stored raw in the zim file.
+ * If possible, this function gives information about which file
+ * to read, and at which offset, to get the data.
+ *
+ * It can be useful as an optimisation when interacting with other systems
+ * by reopening the file and reading the content, bypassing the libzim.
+ *
+ * @return A pair of filename/offset specifying where to read the content.
+ * If it is not possible to have direct access for this item,
+ * return a pair of `{"", 0}`
+ */
+ DirectAccessInfo getDirectAccessInformation() const;
+
+ /** The index this item was created with. */
+ entry_index_type getIndex() const { return m_idx; }
+
+#ifdef ZIM_PRIVATE
+ cluster_index_type getClusterIndex() const;
+#endif
+
+ private: // data
+ std::shared_ptr<FileImpl> m_file;
+ entry_index_type m_idx;
+ std::shared_ptr<const Dirent> m_dirent;
+ };
+
+}
+
+#endif // ZIM_ITEM_H
+
--- /dev/null
+# Generate zim_config.h from the values collected in `public_conf`.
+zim_config = configure_file(output : 'zim_config.h',
+ configuration : public_conf)
+
+# Public headers that are always installed (under the 'zim' sub-directory),
+# including the generated configuration header.
+install_headers(
+ 'archive.h',
+ 'blob.h',
+ 'error.h',
+ 'item.h',
+ 'entry.h',
+ 'uuid.h',
+ 'zim.h',
+ 'suggestion.h',
+ 'suggestion_iterator.h',
+ 'version.h',
+ zim_config,
+ subdir:'zim'
+)
+
+# The search headers are only installed when the xapian dependency was found.
+if xapian_dep.found()
+ install_headers(
+ 'search.h',
+ 'search_iterator.h',
+ subdir:'zim'
+ )
+endif
+
+# Headers of the writer API, installed under 'zim/writer'.
+install_headers(
+ 'writer/item.h',
+ 'writer/creator.h',
+ 'writer/contentProvider.h',
+ subdir:'zim/writer'
+)
+
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_H
+#define ZIM_SEARCH_H
+
+#include "search_iterator.h"
+#include "archive.h"
+#include <vector>
+#include <string>
+#include <map>
+
+// Forward declarations, so that this header does not need the Xapian headers.
+namespace Xapian { class Enquire; class MSet; }
+
+namespace zim
+{
+
+class Archive;
+class InternalDataBase;
+class Query;
+class Search;
+class SearchResultSet;
+
+/**
+ * A Searcher is an object doing fulltext search on a set of Archives.
+ *
+ * A Searcher is mainly used to create new `Search` objects.
+ * Internally, this is mainly a wrapper around a Xapian database.
+ *
+ * You should consider that all search operations are NOT threadsafe.
+ * It is up to you to protect your calls to avoid race conditions.
+ * However, Searcher (and subsequent classes) do not maintain a global/shared state.
+ * You can create several Searchers and use them in different threads.
+ */
+class Searcher
+{
+ public:
+ /** Searcher constructor.
+ *
+ * Construct a searcher on top of several archives (multi search).
+ *
+ * @param archives A list(vector) of archives to search on.
+ */
+ explicit Searcher(const std::vector<Archive>& archives);
+
+ /** Searcher constructor.
+ *
+ * Construct a searcher on top of one archive.
+ *
+ * @param archive An archive to search on.
+ */
+ explicit Searcher(const Archive& archive);
+ Searcher(const Searcher& other);
+ Searcher& operator=(const Searcher& other);
+ Searcher(Searcher&& other);
+ Searcher& operator=(Searcher&& other);
+ ~Searcher();
+
+ /** Add an archive to the searcher.
+ *
+ * Adding an archive to a searcher does not invalidate already created searches.
+ */
+ Searcher& addArchive(const Archive& archive);
+
+ /** Create a search for a specific query.
+ *
+ * The search is made on all archives added to the Searcher.
+ *
+ * @param query The Query to search.
+ *
+ * @throws std::runtime_error if the searcher does not have a valid
+ * FT database.
+ */
+ Search search(const Query& query);
+
+ /** Set the verbosity of search operations.
+ *
+ * @param verbose The verbose mode to set
+ */
+ void setVerbose(bool verbose);
+
+ private: // methods
+ void initDatabase();
+
+ private: // data
+ std::shared_ptr<InternalDataBase> mp_internalDb;
+ std::vector<Archive> m_archives;
+ bool m_verbose;
+};
+
+/**
+ * A Query represents a query.
+ *
+ * It describes what has to be searched and how.
+ * A Query is "database" independent.
+ */
+class Query
+{
+ public:
+ /** Query constructor.
+ *
+ * Create a query; without argument, the textual query is empty.
+ *
+ * @param query The string to search for.
+ */
+ Query(const std::string& query = "");
+
+ /** Set the textual query of the Query.
+ *
+ * @param query The string to search for.
+ */
+ Query& setQuery(const std::string& query);
+
+ /** Set the geographical query of the Query.
+ *
+ * Some articles may be geo positioned.
+ * You can search for articles within a certain distance of a point.
+ *
+ * @param latitude The latitude of the point.
+ * @param longitude The longitude of the point.
+ * @param distance The maximal distance from the point.
+ */
+ Query& setGeorange(float latitude, float longitude, float distance);
+
+ // The members below are public data with in-class default values.
+ std::string m_query { "" };
+
+ bool m_geoquery { false };
+ float m_latitude { 0 };
+ float m_longitude { 0 };
+ float m_distance { 0 } ;
+};
+
+
+/**
+ * A Search represents a particular search, based on a `Searcher`.
+ *
+ * This is the combination of a `Searcher` (what to search on)
+ * and a `Query` (what to search for).
+ *
+ * The constructor taking a database and a query is private;
+ * `Searcher` is declared as a friend.
+ */
+class Search
+{
+ public:
+ Search(Search&& s);
+ Search& operator=(Search&& s);
+ ~Search();
+
+ /** Get a set of results for this search.
+ *
+ * @param start The beginning of the range to get
+ * (offset of the first result).
+ * @param maxResults The maximum number of results to return
+ * (offset of last result from the start of range).
+ */
+ const SearchResultSet getResults(int start, int maxResults) const;
+
+ /** Get the number of estimated results for this search.
+ *
+ * As the name suggests, it is an estimation of the number of results.
+ */
+ int getEstimatedMatches() const;
+
+ private: // methods
+ Search(std::shared_ptr<InternalDataBase> p_internalDb, const Query& query);
+ Xapian::Enquire& getEnquire() const;
+
+ private: // data
+ std::shared_ptr<InternalDataBase> mp_internalDb;
+ // mutable: may be modified from const methods such as getEnquire().
+ mutable std::unique_ptr<Xapian::Enquire> mp_enquire;
+ Query m_query;
+
+ friend class Searcher;
+};
+
+/**
+ * The `SearchResultSet` represents a range of results corresponding to a `Search`.
+ *
+ * It mainly allows to get an iterator.
+ * The constructors are private; `Search` is declared as a friend.
+ */
+class SearchResultSet
+{
+ public:
+ typedef SearchIterator iterator;
+
+ /** The begin iterator on the result range. */
+ iterator begin() const;
+
+ /** The end iterator on the result range. */
+ iterator end() const;
+
+ /** The size of the SearchResultSet (end()-begin()) */
+ int size() const;
+
+ private:
+ SearchResultSet(std::shared_ptr<InternalDataBase> p_internalDb, Xapian::MSet&& mset);
+ SearchResultSet(std::shared_ptr<InternalDataBase> p_internalDb);
+
+ private: // data
+ std::shared_ptr<InternalDataBase> mp_internalDb;
+ std::shared_ptr<Xapian::MSet> mp_mset;
+ friend class Search;
+};
+
+} //namespace zim
+
+#endif // ZIM_SEARCH_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_ITERATOR_H
+#define ZIM_SEARCH_ITERATOR_H
+
+#include <memory>
+#include <iterator>
+#include "entry.h"
+#include "archive.h"
+#include "uuid.h"
+
+namespace zim
+{
+class SearchResultSet;
+
+/**
+ * Bidirectional iterator over the results of a search.
+ *
+ * Besides the usual iterator operations, it exposes getters on the current
+ * result. Its state is held in `InternalData`, which is only declared here.
+ * The constructor taking an `InternalData*` is private; `SearchResultSet`
+ * is declared as a friend.
+ */
+class SearchIterator : public std::iterator<std::bidirectional_iterator_tag, Entry>
+{
+ friend class zim::SearchResultSet;
+ public:
+ SearchIterator();
+ SearchIterator(const SearchIterator& it);
+ SearchIterator& operator=(const SearchIterator& it);
+ SearchIterator(SearchIterator&& it);
+ SearchIterator& operator=(SearchIterator&& it);
+ ~SearchIterator();
+
+ bool operator== (const SearchIterator& it) const;
+ bool operator!= (const SearchIterator& it) const;
+
+ SearchIterator& operator++();
+ SearchIterator operator++(int);
+ SearchIterator& operator--();
+ SearchIterator operator--(int);
+
+ // Getters on the current result.
+ std::string getPath() const;
+ std::string getTitle() const;
+ int getScore() const;
+ std::string getSnippet() const;
+ int getWordCount() const;
+ int getSize() const;
+ int getFileIndex() const;
+ Uuid getZimId() const;
+ reference operator*() const;
+ pointer operator->() const;
+
+#ifdef ZIM_PRIVATE
+ std::string getDbData() const;
+#endif
+
+ private:
+ // Opaque state, defined outside of this header.
+ struct InternalData;
+ std::unique_ptr<InternalData> internal;
+ SearchIterator(InternalData* internal_data);
+
+ bool isEnd() const;
+};
+
+} // namespace zim
+
+#endif // ZIM_SEARCH_ITERATOR_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SUGGESTION_H
+#define ZIM_SUGGESTION_H
+
+#include "suggestion_iterator.h"
+#include "archive.h"
+
+#if defined(LIBZIM_WITH_XAPIAN)
+// Forward declarations, so that this header does not need the Xapian headers.
+namespace Xapian { class Enquire; class MSet; }
+#endif
+
+namespace zim
+{
+
+class SuggestionSearcher;
+class SuggestionSearch;
+class SuggestionIterator;
+class SuggestionDataBase;
+
+/**
+ * A SuggestionSearcher is an object suggesting over titles of an Archive
+ *
+ * A SuggestionSearcher is mainly used to create new `SuggestionSearch` objects.
+ * Internally, this is a wrapper around a SuggestionDataBase which may or may not
+ * include a Xapian index.
+ *
+ * You should consider that all search operations are NOT threadsafe.
+ * It is up to you to protect your calls to avoid race conditions.
+ * However, SuggestionSearcher (and subsequent classes) do not maintain a
+ * global/shared state. You can create several Searchers and use them in
+ * different threads.
+ */
+class SuggestionSearcher
+{
+ public:
+ /** SuggestionSearcher constructor.
+ *
+ * Construct a SuggestionSearcher on top of an archive.
+ *
+ * @param archive An archive to suggest on.
+ */
+ explicit SuggestionSearcher(const Archive& archive);
+
+ SuggestionSearcher(const SuggestionSearcher& other);
+ SuggestionSearcher& operator=(const SuggestionSearcher& other);
+ SuggestionSearcher(SuggestionSearcher&& other);
+ SuggestionSearcher& operator=(SuggestionSearcher&& other);
+ ~SuggestionSearcher();
+
+ /** Create a SuggestionSearch for a specific query.
+ *
+ * The search is made on the archive under the SuggestionSearcher.
+ *
+ * @param query The query string to search.
+ */
+ SuggestionSearch suggest(const std::string& query);
+
+ /** Set the verbosity of search operations.
+ *
+ * @param verbose The verbose mode to set
+ */
+ void setVerbose(bool verbose);
+
+ private: // methods
+ void initDatabase();
+
+ private: // data
+ std::shared_ptr<SuggestionDataBase> mp_internalDb;
+ Archive m_archive;
+ bool m_verbose;
+};
+
+/**
+ * A SuggestionSearch represents a particular suggestion search, based on a `SuggestionSearcher`.
+ *
+ * The constructor taking a database and a query is private;
+ * `SuggestionSearcher` is declared as a friend.
+ */
+class SuggestionSearch
+{
+ public:
+ SuggestionSearch(SuggestionSearch&& s);
+ SuggestionSearch& operator=(SuggestionSearch&& s);
+ ~SuggestionSearch();
+
+ /** Get a set of results for this search.
+ *
+ * @param start The beginning of the range to get
+ * (offset of the first result).
+ * @param maxResults The maximum number of results to return
+ * (offset of last result from the start of range).
+ */
+ const SuggestionResultSet getResults(int start, int maxResults) const;
+
+ /** Get the number of estimated results for this suggestion search.
+ *
+ * As the name suggests, it is an estimation of the number of results.
+ */
+ int getEstimatedMatches() const;
+
+ private: // methods
+ SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query);
+
+ private: // data
+ std::shared_ptr<SuggestionDataBase> mp_internalDb;
+ std::string m_query;
+
+ friend class SuggestionSearcher;
+
+#ifdef ZIM_PRIVATE
+ public:
+ // Close Xapian db to force range based search
+ // NOTE(review): `const` on a `void` return type has no effect; it can only
+ // be dropped together with the out-of-line definition.
+ const void forceRangeSuggestion();
+#endif
+
+// Xapian based methods and data
+#if defined(LIBZIM_WITH_XAPIAN)
+ private: // Xapian based methods
+ Xapian::Enquire& getEnquire() const;
+
+ private: // Xapian based data
+ // mutable: may be modified from const methods such as getEnquire().
+ mutable std::unique_ptr<Xapian::Enquire> mp_enquire;
+#endif // LIBZIM_WITH_XAPIAN
+};
+
+/**
+ * The `SuggestionResultSet` represents a range of results corresponding to a `SuggestionSearch`.
+ *
+ * It mainly allows to get an iterator either based on an MSetIterator or a RangeIterator.
+ * The constructors are private; `SuggestionSearch` is declared as a friend.
+ */
+class SuggestionResultSet
+{
+ public:
+ typedef SuggestionIterator iterator;
+ typedef Archive::EntryRange<EntryOrder::titleOrder> EntryRange;
+
+ /** The begin iterator on the result range. */
+ iterator begin() const;
+
+ /** The end iterator on the result range. */
+ iterator end() const;
+
+ /** The size of the SuggestionResultSet (end()-begin()) */
+ int size() const;
+
+ private: // data
+ std::shared_ptr<SuggestionDataBase> mp_internalDb;
+ std::shared_ptr<EntryRange> mp_entryRange;
+
+ private:
+ SuggestionResultSet(EntryRange entryRange);
+
+ friend class SuggestionSearch;
+
+// Xapian based methods and data
+#if defined(LIBZIM_WITH_XAPIAN)
+
+ private: // Xapian based methods
+ SuggestionResultSet(std::shared_ptr<SuggestionDataBase> p_internalDb, Xapian::MSet&& mset);
+
+ private: // Xapian based data
+ std::shared_ptr<Xapian::MSet> mp_mset;
+
+#endif // LIBZIM_WITH_XAPIAN
+};
+
+} // namespace zim
+
+#endif // ZIM_SUGGESTION_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SUGGESTION_ITERATOR_H
+#define ZIM_SUGGESTION_ITERATOR_H
+
+#include "archive.h"
+#include <iterator>
+
+namespace zim
+{
+class SuggestionResultSet;
+class SuggestionItem;
+class SearchIterator;
+
+/**
+ * Bidirectional iterator over the results of a suggestion search.
+ *
+ * Dereferencing gives a `SuggestionItem`; `getEntry()` gives the `Entry`.
+ * It holds a range iterator (title order) and, when built with
+ * LIBZIM_WITH_XAPIAN, an additional `SuggestionInternalData`.
+ * There is no default constructor and the other constructors are private;
+ * `SuggestionResultSet` is declared as a friend.
+ */
+class SuggestionIterator : public std::iterator<std::bidirectional_iterator_tag, SuggestionItem>
+{
+ typedef Archive::iterator<EntryOrder::titleOrder> RangeIterator;
+ friend class SuggestionResultSet;
+ public:
+ SuggestionIterator() = delete;
+ SuggestionIterator(const SuggestionIterator& it);
+ SuggestionIterator& operator=(const SuggestionIterator& it);
+ SuggestionIterator(SuggestionIterator&& it);
+ SuggestionIterator& operator=(SuggestionIterator&& it);
+ ~SuggestionIterator();
+
+ bool operator== (const SuggestionIterator& it) const;
+ bool operator!= (const SuggestionIterator& it) const;
+
+ SuggestionIterator& operator++();
+ SuggestionIterator operator++(int);
+ SuggestionIterator& operator--();
+ SuggestionIterator operator--(int);
+
+ Entry getEntry() const;
+
+ // Unlike most iterators, the dereference operators are not const.
+ const SuggestionItem& operator*();
+ const SuggestionItem* operator->();
+
+ private: // data
+ // Opaque state, defined outside of this header.
+ struct SuggestionInternalData;
+ std::unique_ptr<RangeIterator> mp_rangeIterator;
+ std::unique_ptr<SuggestionItem> m_suggestionItem;
+
+ private: // methods
+ SuggestionIterator(RangeIterator rangeIterator);
+
+// Xapian based methods and data
+#if defined(LIBZIM_WITH_XAPIAN)
+#ifdef ZIM_PRIVATE
+ public:
+ std::string getDbData() const;
+#endif
+ private: // xapian based data
+ std::unique_ptr<SuggestionInternalData> mp_internal;
+
+ private: // xapian based methods
+ std::string getIndexPath() const;
+ std::string getIndexTitle() const;
+ std::string getIndexSnippet() const;
+ SuggestionIterator(SuggestionInternalData* internal_data);
+#endif // LIBZIM_WITH_XAPIAN
+};
+
+/**
+ * A SuggestionItem holds the title, the path and the (possibly empty) snippet
+ * of one suggestion.
+ *
+ * The constructor is private; `SuggestionIterator` is declared as a friend.
+ */
+class SuggestionItem
+{
+  public: // methods
+    /** A copy of the title. */
+    std::string getTitle() const
+    {
+      return title;
+    }
+
+    /** A copy of the path. */
+    std::string getPath() const
+    {
+      return path;
+    }
+
+    /** A copy of the snippet (may be empty). */
+    std::string getSnippet() const
+    {
+      return snippet;
+    }
+
+    /** True when the snippet is not empty. */
+    bool hasSnippet() const
+    {
+      return snippet.size() != 0;
+    }
+
+  private: // data
+    std::string title;
+    std::string path;
+    std::string snippet;
+
+  private: // methods
+    explicit SuggestionItem(std::string title_, std::string path_, std::string snippet_ = "")
+      : title(title_),
+        path(path_),
+        snippet(snippet_)
+    {}
+
+  friend class SuggestionIterator;
+};
+
+} // namespace zim
+
+#endif // ZIM_SUGGESTION_ITERATOR_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_UUID_H
+#define ZIM_UUID_H
+
+#include <iosfwd>
+#include <algorithm>
+#include <cstring>
+#include <string>
+
+namespace zim
+{
+ /**
+  * A 16 byte identifier.
+  *
+  * The bytes are public (`data`); two Uuids are equal when all 16 bytes match.
+  */
+ struct Uuid
+ {
+   /** Create a Uuid with all 16 bytes set to zero. */
+   Uuid()
+   {
+     std::fill(data, data + 16, 0);
+   }
+
+   /** Create a Uuid from the 16 bytes pointed to by `uuid`. */
+   Uuid(const char uuid[16])
+   {
+     std::memcpy(data, uuid, 16);
+   }
+
+   static Uuid generate(std::string value = "");
+
+   bool operator== (const Uuid& other) const
+   {
+     return std::memcmp(data, other.data, 16) == 0;
+   }
+
+   bool operator!= (const Uuid& other) const
+   {
+     return !operator==(other);
+   }
+
+   /** Number of bytes in a Uuid: always 16. */
+   unsigned size() const
+   {
+     return 16;
+   }
+
+   explicit operator std::string() const;
+
+   char data[16];
+ };
+
+ std::ostream& operator<< (std::ostream& out, const Uuid& uuid);
+
+}
+
+#endif // ZIM_UUID_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_VERSION_H
+#define ZIM_VERSION_H
+
+#include <iostream>
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace zim
+{
+ /** A list of pairs of strings; presumably (library name, version) - confirm. */
+ typedef std::vector<std::pair<std::string, std::string>> LibVersions;
+ /** Get the versions as a `LibVersions` list. */
+ LibVersions getVersions();
+ /** Print the versions to `out` (`std::cout` by default). */
+ void printVersions(std::ostream& out = std::cout);
+}
+
+#endif // ZIM_VERSION_H
+
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CONTENTPROVIDER_H
+#define ZIM_WRITER_CONTENTPROVIDER_H
+
+#include <stdexcept>
+#include <zim/blob.h>
+#include <zim/zim.h>
+#include <string>
+
+namespace zim
+{
+ // Forward-declare the platform specific `FD` class:
+ // `zim::windows::FD` when _WIN32 is defined, `zim::unix::FD` otherwise.
+ // DEFAULTFD names the selected class for the rest of this header.
+#ifdef _WIN32
+ #define DEFAULTFD zim::windows::FD
+ namespace windows {
+#else
+ #define DEFAULTFD zim::unix::FD
+ namespace unix {
+#endif
+ class FD;
+ }
+ namespace writer
+ {
+ /**
+ * `ContentProvider` is an abstract class in charge of providing the content to
+ * add in the archive to the creator.
+ *
+ * Implementations must define both `getSize()` and `feed()`.
+ */
+ class ContentProvider {
+ public:
+ virtual ~ContentProvider() = default;
+ /**
+ * The size of the content to add into the archive.
+ *
+ * @return the total size of the content.
+ */
+ virtual zim::size_type getSize() const = 0;
+
+ /**
+ * Return a blob to add to the archive.
+ *
+ * The returned blob doesn't have to represent the whole content.
+ * The feed method can return the whole content chunk by chunk or in
+ * one step.
+ * When the whole content has been returned, feed must return an empty blob
+ * (size == 0).
+ *
+ * This method will be called several times (at least twice) for
+ * each content to add.
+ *
+ * It is up to the implementation to manage correctly the data pointed by
+ * the returned blob.
+ * It may (re)use the same buffer between calls (rewriting its content),
+ * create a new buffer each time or make the blob point to a new region of
+ * a big buffer.
+ * It is up to the implementation to free any allocated memory.
+ *
+ * The data pointed by the blob must stay valid until the next call to feed.
+ * A call to feed ensures that the data returned by a previous call will not
+ * be used anymore.
+ *
+ * @return the next chunk of content, or an empty blob once everything
+ * has been returned.
+ */
+ virtual Blob feed() = 0;
+ };
+
+ /**
+  * StringProvider provides the content stored in a string.
+  *
+  * The provider keeps its own copy of the string.
+  */
+ class StringProvider : public ContentProvider {
+   public:
+     /**
+      * Create a provider using a string as content.
+      * The string content is copied, so the reference does not have to be
+      * kept alive.
+      *
+      * @param content the content to serve.
+      */
+     explicit StringProvider(const std::string& content)
+       : content(content),
+         feeded(false)
+     {}
+
+     /** @return the size of the stored string. */
+     zim::size_type getSize() const override { return content.size(); }
+
+     // `override` makes the compiler check these against ContentProvider.
+     Blob feed() override;
+
+   protected:
+     std::string content;
+     // Starts as false; presumably set once the content has been fed
+     // (see the definition of feed()).
+     bool feeded;
+ };
+
+ /**
+ * SharedStringProvider provides the content stored in a shared string.
+ *
+ * It is mostly the same thing as `StringProvider` but it uses a shared_ptr
+ * to avoid a copy of the string.
+ */
+ class SharedStringProvider : public ContentProvider {
+ public:
+ /**
+ * Create a provider using a string as content.
+ * The string content is not copied; only the shared_ptr is.
+ *
+ * @param content the content to serve.
+ */
+ explicit SharedStringProvider(std::shared_ptr<const std::string> content)
+ : content(content),
+ feeded(false)
+ {}
+ // Size of the shared string. The pointer is dereferenced unconditionally,
+ // so `content` must not be null.
+ zim::size_type getSize() const override { return content->size(); }
+ // Defined out of line (implementation not visible in this header).
+ Blob feed() override;
+
+ protected:
+ std::shared_ptr<const std::string> content; // shared ownership of the content to serve
+ bool feeded; // starts as false; presumably set once feed() has served the content - TODO confirm
+ };
+
+
+ /**
+ * FileProvider provides the content stored in a file.
+ */
+ class FileProvider : public ContentProvider {
+ public:
+ /**
+ * Create a provider using a file as content.
+ *
+ * @param filepath the path to the file to serve.
+ */
+ explicit FileProvider(const std::string& filepath);
+ ~FileProvider();
+ // Returns the `size` member; it is set outside of this header.
+ zim::size_type getSize() const override { return size; }
+ // Defined out of line (implementation not visible in this header).
+ Blob feed() override;
+
+ protected:
+ std::string filepath;
+ zim::size_type size;
+
+ private:
+ std::unique_ptr<char[]> buffer;
+ std::unique_ptr<DEFAULTFD> fd; // DEFAULTFD is zim::windows::FD or zim::unix::FD
+ zim::offset_type offset;
+ };
+
+ }
+}
+
+#undef DEFAULTFD
+
+#endif // ZIM_WRITER_CONTENTPROVIDER_H
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CREATOR_H
+#define ZIM_WRITER_CREATOR_H
+
+#include <memory>
+#include <zim/zim.h>
+#include <zim/writer/item.h>
+
+namespace zim
+{
+ class Fileheader;
+ namespace writer
+ {
+ class CreatorData;
+
+ /**
+ * The `Creator` is responsible for creating a zim file.
+ *
+ * Once the `Creator` is instantiated, it can be configured with the
+ * `config*` methods.
+ * Then the creation process must be started with `startZimCreation`.
+ * Elements of the zim file can be added using the `add*` methods.
+ * The final step is to call `finishZimCreation`.
+ *
+ * During the creation of the zim file (and before the call to `finishZimCreation`),
+ * some values must be set using the `set*` methods.
+ */
+ class Creator
+ {
+ public:
+
+ /**
+ * Creator constructor.
+ *
+ * The constructor takes no argument: verbosity, compression and the other
+ * settings are configured afterwards with the `config*` methods.
+ */
+ Creator();
+ virtual ~Creator();
+
+ /**
+ * Configure the verbosity of the creator
+ *
+ * @param verbose if the creator prints verbose information.
+ * @return a reference to itself.
+ */
+ Creator& configVerbose(bool verbose);
+
+ /**
+ * Configure the compression algorithm to use.
+ *
+ * @param compression the compression algorithm to use.
+ * @return a reference to itself.
+ */
+ Creator& configCompression(Compression compression);
+
+ /**
+ * Set the size of the created clusters.
+ *
+ * The creator will try to create clusters with an (uncompressed) size
+ * as close as possible to targetSize without exceeding that limit.
+ * If not possible, the only such case being an item larger than targetSize,
+ * a separate cluster will be allocated for that oversized item.
+ *
+ * Be careful with this value.
+ * A bigger value means more content put together, so a better compression ratio.
+ * But it also means that more decompression has to be done when reading a blob.
+ * If you don't know which value to put, don't use this method and let libzim
+ * use the default value.
+ *
+ * @param targetSize The target size of a cluster (in bytes).
+ * @return a reference to itself.
+ */
+ Creator& configClusterSize(zim::size_type targetSize);
+
+ /**
+ * Configure the fulltext indexing feature.
+ *
+ * @param indexing True if we must fulltext index the content.
+ * @param language Language to use for the indexation.
+ * @return a reference to itself.
+ */
+ Creator& configIndexing(bool indexing, const std::string& language);
+
+ /**
+ * Set the number of threads to use for the internal workers.
+ *
+ * @param nbWorkers The number of workers to use.
+ * @return a reference to itself.
+ */
+ Creator& configNbWorkers(unsigned nbWorkers);
+
+ /**
+ * Start the zim creation.
+ *
+ * The creator must have been configured before calling this method.
+ *
+ * @param filepath the path of the zim file to create.
+ */
+ void startZimCreation(const std::string& filepath);
+
+ /**
+ * Add an item to the archive.
+ *
+ * @param item The item to add.
+ */
+ void addItem(std::shared_ptr<Item> item);
+
+ /**
+ * Add a metadata to the archive.
+ *
+ * @param name the name of the metadata
+ * @param content the content of the metadata
+ * @param mimetype the mimetype of the metadata
+ * (defaults to "text/plain;charset=utf-8").
+ * Only used to detect if the metadata must be compressed or not.
+ */
+ void addMetadata(const std::string& name, const std::string& content, const std::string& mimetype = "text/plain;charset=utf-8");
+
+ /**
+ * Add a metadata to the archive using a contentProvider instead of plain string.
+ *
+ * Unlike the string overload, this overload has no default mimetype.
+ *
+ * @param name the name of the metadata.
+ * @param provider the provider of the content of the metadata.
+ * @param mimetype the mimetype of the metadata.
+ * Only used to detect if the metadata must be compressed.
+ */
+ void addMetadata(const std::string& name, std::unique_ptr<ContentProvider> provider, const std::string& mimetype);
+
+ /**
+ * Add illustration to the archive.
+ *
+ * @param size the size (width and height) of the illustration.
+ * @param content the content of the illustration (must be a png content)
+ */
+ void addIllustration(unsigned int size, const std::string& content);
+
+ /**
+ * Add illustration to the archive.
+ *
+ * @param size the size (width and height) of the illustration.
+ * @param provider the provider of the content of the illustration (must be a png content)
+ */
+ void addIllustration(unsigned int size, std::unique_ptr<ContentProvider> provider);
+
+ /**
+ * Add a redirection to the archive.
+ *
+ * Hints (especially FRONT_ARTICLE) can be used to put the redirection
+ * in the front articles list.
+ * By default, redirections are not front articles.
+ *
+ * @param path the path of the redirection.
+ * @param title the title of the redirection.
+ * @param targetpath the path of the target of the redirection.
+ * @param hints hints associated to the redirection (empty by default).
+ */
+ void addRedirection(
+ const std::string& path,
+ const std::string& title,
+ const std::string& targetpath,
+ const Hints& hints = Hints());
+
+ /**
+ * Finalize the zim creation.
+ */
+ void finishZimCreation();
+
+ /**
+ * Set the path of the main page.
+ *
+ * @param mainPath The path of the main page.
+ */
+ void setMainPath(const std::string& mainPath) { m_mainPath = mainPath; }
+
+ /**
+ * Set the uuid of the archive.
+ *
+ * If this method is never called, the uuid generated at construction
+ * (`Uuid::generate()`) is kept.
+ *
+ * @param uuid The uuid of the archive.
+ */
+ void setUuid(const zim::Uuid& uuid) { m_uuid = uuid; }
+
+ private:
+ std::unique_ptr<CreatorData> data;
+
+ // configuration
+ bool m_verbose = false;
+ Compression m_compression = Compression::Zstd;
+ bool m_withIndex = false;
+ // NOTE(review): no in-class initializer here, unlike the other settings;
+ // presumably the constructor sets the default cluster size - TODO confirm.
+ size_t m_clusterSize;
+ std::string m_indexingLanguage;
+ unsigned m_nbWorkers = 4;
+
+ // zim data
+ std::string m_mainPath;
+ Uuid m_uuid = Uuid::generate();
+
+ void fillHeader(Fileheader* header) const;
+ void writeLastParts() const;
+ };
+ }
+
+}
+
+#endif // ZIM_WRITER_CREATOR_H
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_ITEM_H
+#define ZIM_WRITER_ITEM_H
+
+#include <stdexcept>
+#include <zim/blob.h>
+#include <zim/zim.h>
+#include <zim/uuid.h>
+#include <string>
+
+#include <map>
+
+namespace zim
+{
+ namespace writer
+ {
+ // Keys of the hints an item (or a redirection) can give to the creator.
+ // See the documentation of `Item::getHints` for the meaning of each key.
+ enum HintKeys {
+ COMPRESS,
+ FRONT_ARTICLE,
+ };
+ // A set of hints: each hint key is associated with an integer value.
+ using Hints = std::map<HintKeys, uint64_t>;
+
+ class ContentProvider;
+ // Abstract interface giving access to the data to index for an item.
+ // Instances are returned by `Item::getIndexData`.
+ class IndexData {
+ public:
+ // NOTE(review): presumably (is valid, latitude, longitude) - the meaning of
+ // the tuple fields is not established by this header; confirm.
+ using GeoPosition = std::tuple<bool, double, double>;
+ virtual ~IndexData() = default;
+ // Whether there is something to index at all.
+ virtual bool hasIndexData() const = 0;
+ virtual std::string getTitle() const = 0;
+ virtual std::string getContent() const = 0;
+ virtual std::string getKeywords() const = 0;
+ virtual uint32_t getWordCount() const = 0;
+ virtual GeoPosition getGeoPosition() const = 0;
+ };
+
+ /**
+ * Item represents data to be added to the archive.
+ *
+ * This is an abstract class the user needs to implement.
+ * libzim provides `BasicItem`, `StringItem` and `FileItem`
+ * to simplify (or avoid) this reimplementation.
+ */
+ class Item
+ {
+ public:
+ /**
+ * The path of the item.
+ *
+ * The path must be absolute.
+ * Path must be unique.
+ *
+ * @return the path of the item.
+ */
+ virtual std::string getPath() const = 0;
+
+ /**
+ * The title of the item.
+ *
+ * Item's title is indexed and is used for the suggestion system.
+ * Titles don't have to be unique.
+ *
+ * @return the title of the item.
+ */
+ virtual std::string getTitle() const = 0;
+
+ /**
+ * The mimetype of the item.
+ *
+ * Mimetype is stored with the content.
+ * It is also used to detect if the content must be compressed or not.
+ *
+ * @return the mimetype of the item.
+ */
+ virtual std::string getMimeType() const = 0;
+
+ /**
+ * The content provider of the item.
+ *
+ * The content provider is responsible for providing the content to the creator.
+ * The returned content provider must stay valid even after the creator
+ * releases its reference to the item.
+ *
+ * This method will be called once by libzim, in the main thread
+ * (but will be used in a different thread).
+ * The default IndexData will also call this method once (more)
+ * in the main thread (and use it in another thread).
+ *
+ * @return the contentProvider of the item.
+ */
+ virtual std::unique_ptr<ContentProvider> getContentProvider() const = 0;
+
+ /**
+ * The index data of the item.
+ *
+ * The index data is the data to index. (May be different from the content
+ * to store).
+ * The returned index data must stay valid even after the creator releases
+ * its reference to the item.
+ * This method will be called once by libzim if it is compiled with xapian
+ * (and is configured to index data).
+ *
+ * The returned IndexData will be used as source to index the item.
+ * If you don't want the item to be indexed, you can return a nullptr here
+ * or return a valid IndexData pointer which will return false to `hasIndexData`.
+ *
+ * If you don't implement this method, a default implementation will be used.
+ * The default implementation first checks for the mimetype and if the mimetype
+ * contains `text/html` it will use a contentProvider to get the content to index.
+ * The contentProvider will be created in the main thread but the data reading and
+ * parsing will occur in a different thread.
+ *
+ * @return the indexData of the item.
+ * May return a nullptr if there is no indexData.
+ */
+ virtual std::shared_ptr<IndexData> getIndexData() const;
+
+ /**
+ * Hints to help the creator make decisions about the item.
+ *
+ * For now two hints are supported:
+ * - COMPRESS: Can be used to force the creator to put the item content
+ * in a compressed cluster (if true) or not (if false).
+ * If the hint is not provided, the decision is taken based on the
+ * mimetype (textual or binary content ?)
+ * - FRONT_ARTICLE: Can (Should) be used to specify if the item is
+ * a front article or not.
+ * If the hint is not provided, the decision is taken based on the
+ * mimetype (html or not ?)
+ *
+ * @return A list of hints.
+ */
+ virtual Hints getHints() const;
+
+ /**
+ * Returns the getHints() amended with default values based on mimetypes.
+ *
+ * This method is not virtual.
+ */
+ Hints getAmendedHints() const;
+ virtual ~Item() = default;
+ };
+
+ /**
+ * A BasicItem is a partial implementation of an Item.
+ *
+ * `BasicItem` provides a basic implementation for everything about an `Item`
+ * but the actual content of the item (`getContentProvider` stays pure virtual).
+ */
+ class BasicItem : public Item
+ {
+ public:
+ /**
+ * Create a BasicItem with the given path, mimetype, title and hints.
+ *
+ * @param path the path of the item.
+ * @param mimetype the mimetype of the item.
+ * @param title the title of the item.
+ * @param hints the hints returned by `getHints`.
+ */
+ BasicItem(const std::string& path, const std::string& mimetype, const std::string& title, Hints hints)
+ : path(path),
+ mimetype(mimetype),
+ title(title),
+ hints(hints)
+ {}
+
+ // Plain accessors returning copies of the values given at construction.
+ std::string getPath() const override { return path; }
+ std::string getTitle() const override { return title; }
+ std::string getMimeType() const override { return mimetype; }
+ Hints getHints() const override { return hints; }
+
+ protected:
+ std::string path;
+ std::string mimetype;
+ std::string title;
+ Hints hints;
+ };
+
+ /**
+ * A `StringItem` is a fully implemented item where the content is stored in a string.
+ */
+ class StringItem : public BasicItem, public std::enable_shared_from_this<StringItem>
+ {
+ public:
+ /**
+ * Create a StringItem with the given path, mimetype, title, hints and content.
+ *
+ * The constructor is private, so this factory is the only way to get a
+ * StringItem; the instance is always owned by a shared_ptr.
+ * The parameters are forwarded as-is to the private constructor.
+ *
+ * @param path the path of the item.
+ * @param mimetype the mimetype of the item.
+ * @param title the title of the item.
+ * @param hints the hints of the item.
+ * @param content the content of the item.
+ * @return a shared_ptr owning the newly created item.
+ */
+ template<typename... Ts>
+ static std::shared_ptr<StringItem> create(Ts&&... params) {
+ // std::make_shared cannot be used here because the constructor is private.
+ return std::shared_ptr<StringItem>(new StringItem(std::forward<Ts>(params)...));
+ }
+
+ // Defined out of line (implementation not visible in this header).
+ std::unique_ptr<ContentProvider> getContentProvider() const override;
+
+ protected:
+ std::string content;
+
+ private:
+ StringItem(const std::string& path, const std::string& mimetype,
+ const std::string& title, Hints hints, const std::string& content)
+ : BasicItem(path, mimetype, title, hints),
+ content(content)
+ {}
+ };
+
+ /**
+ * A `FileItem` is a fully implemented item where the content is a file.
+ */
+ class FileItem : public BasicItem
+ {
+ public:
+ /**
+ * Create a FileItem with the given path, mimetype, title, hints and filepath.
+ *
+ * @param path the path of the item.
+ * @param mimetype the mimetype of the item.
+ * @param title the title of the item.
+ * @param hints the hints of the item.
+ * @param filepath the path of the file in the filesystem.
+ */
+ FileItem(const std::string& path, const std::string& mimetype,
+ const std::string& title, Hints hints, const std::string& filepath)
+ : BasicItem(path, mimetype, title, hints),
+ filepath(filepath)
+ {}
+
+ // Defined out of line (implementation not visible in this header).
+ std::unique_ptr<ContentProvider> getContentProvider() const override;
+
+ protected:
+ std::string filepath;
+ };
+
+ }
+}
+
+#endif // ZIM_WRITER_ITEM_H
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2018-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ZIM_H
+#define ZIM_ZIM_H
+
+#include <cstdint>
+
+// DEPRECATED marks a declaration as deprecated, using the compiler-specific
+// syntax of GCC-compatible compilers or MSVC. On any other compiler the macro
+// expands to nothing and a diagnostic message is emitted at compile time.
+#ifdef __GNUC__
+#define DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define DEPRECATED __declspec(deprecated)
+#else
+#pragma message("WARNING: You need to implement DEPRECATED for this compiler")
+#define DEPRECATED
+#endif
+
+
+#include <zim/zim_config.h>
+
+namespace zim
+{
+ // An index of an entry (in a zim file)
+ typedef uint32_t entry_index_type;
+
+ // An index of a cluster (in a zim file)
+ typedef uint32_t cluster_index_type;
+
+ // An index of a blob (in a cluster)
+ typedef uint32_t blob_index_type;
+
+ // The size of something (entry, zim, cluster, blob, ...)
+ typedef uint64_t size_type;
+
+ // An offset.
+ typedef uint64_t offset_type;
+
+ // The compression algorithms selectable with `Creator::configCompression`.
+ // NOTE(review): the explicit values (1, 4, 5) are presumably the codes
+ // stored in the zim file format - confirm before changing them.
+ enum class Compression
+ {
+ None = 1,
+ Lzma = 4,
+ Zstd = 5
+ };
+
+ // Mimetype string "text/x-zim-htmltemplate".
+ // Declared `static` in a header, so each translation unit including this
+ // file gets its own copy of the array.
+ static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate";
+
+ // The individual integrity checks that can be identified by name.
+ // The enumerators have no explicit values, so they are numbered from 0 and
+ // COUNT equals the number of checks declared before it.
+ enum class IntegrityCheck
+ {
+ CHECKSUM,
+ DIRENT_PTRS, // Checks that offsets in UrlPtrList are valid
+ DIRENT_ORDER, // Checks that dirents are properly sorted
+ TITLE_INDEX, // Checks that entries in the title index are valid
+ // and properly sorted
+ CLUSTER_PTRS, // Checks that offsets in ClusterPtrList are valid
+ DIRENT_MIMETYPES, // Checks that mime-type values in dirents are valid
+
+ // This must be the last one and denotes the count of all checks
+ COUNT
+ };
+}
+
+#endif // ZIM_ZIM_H
+
--- /dev/null
+# Top-level build definition of libzim: project metadata, configuration
+# values, dependency lookup, sub-directories and pkg-config file generation.
+project('libzim', ['c', 'cpp'],
+ version : '7.2.0',
+ license : 'GPL2',
+ default_options : ['c_std=c11', 'cpp_std=c++11'])
+
+# Large-file support macros, added whenever the build machine is not Windows.
+if build_machine.system() != 'windows'
+ add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp')
+endif
+
+# Type sizes probed with the C++ compiler; they drive MMAP_SUPPORT_64 and
+# ENV64BIT/ENV32BIT below.
+cpp = meson.get_compiler('cpp')
+sizeof_off_t = cpp.sizeof('off_t')
+sizeof_size_t = cpp.sizeof('size_t')
+
+# Two sets of configuration values are filled here: private_conf and
+# public_conf (presumably consumed by configure_file() calls in the
+# sub-directories - TODO confirm).
+private_conf = configuration_data()
+public_conf = configuration_data()
+
+private_conf.set('VERSION', '"@0@"'.format(meson.project_version()))
+public_conf.set('LIBZIM_VERSION', '"@0@"'.format(meson.project_version()))
+private_conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE'))
+private_conf.set('DIRENT_LOOKUP_CACHE_SIZE', get_option('DIRENT_LOOKUP_CACHE_SIZE'))
+private_conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE'))
+private_conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE'))
+private_conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8)
+private_conf.set10('ENV64BIT', sizeof_size_t==8)
+private_conf.set10('ENV32BIT', sizeof_size_t==4)
+# mmap is always disabled when targeting Windows, whatever USE_MMAP says.
+if target_machine.system() == 'windows'
+ private_conf.set('ENABLE_USE_MMAP', false)
+ add_project_arguments('-DNOMINMAX', language: 'cpp')
+else
+ private_conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP'))
+endif
+private_conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER'))
+
+# Static linkage is either requested explicitly or implied by a static
+# default_library.
+static_linkage = get_option('static-linkage')
+static_linkage = static_linkage or get_option('default_library')=='static'
+
+lzma_dep = dependency('liblzma', static:static_linkage)
+if static_linkage
+ add_project_arguments('-DLZMA_API_STATIC', language: 'cpp')
+endif
+
+zstd_dep = dependency('libzstd', static:static_linkage)
+
+# NOTE(review): execinfo_dep is only defined when targeting FreeBSD, so any
+# user of it must be guarded by the same condition.
+if target_machine.system() == 'freebsd'
+ execinfo_dep = cpp.find_library('execinfo')
+endif
+
+# Xapian is optional: with with_xapian=false an empty, not-found dependency
+# is used so that xapian_dep.found() is false.
+if get_option('with_xapian')
+ xapian_dep = dependency('xapian-core', static:static_linkage)
+else
+ xapian_dep = dependency('', required:false)
+endif
+private_conf.set('ENABLE_XAPIAN', xapian_dep.found())
+public_conf.set('LIBZIM_WITH_XAPIAN', xapian_dep.found())
+
+# pkg_requires is the `requires` list of the generated pkg-config file.
+pkg_requires = ['liblzma', 'libzstd']
+if build_machine.system() == 'windows'
+ extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin']
+ extra_cpp_args = ['-DSORTPP_PASS']
+else
+ extra_link_args = []
+ extra_cpp_args = []
+endif
+
+# NOTE(review): `compiler` is obtained exactly like `cpp` above.
+compiler = meson.get_compiler('cpp')
+if (compiler.get_id() == 'gcc' and build_machine.system() == 'linux') or target_machine.system() == 'freebsd'
+ # C++ std::thread is implemented using pthread on linux by gcc
+ thread_dep = dependency('threads')
+else
+ thread_dep = dependency('', required:false)
+endif
+
+# ICU is mandatory (and advertised in the pkg-config file) only when xapian
+# was found; otherwise it is looked up as an optional dependency.
+if xapian_dep.found()
+ pkg_requires += ['xapian-core']
+ icu_dep = dependency('icu-i18n', static:static_linkage)
+ pkg_requires += ['icu-i18n']
+else
+ icu_dep = dependency('icu-i18n', required:false, static:static_linkage)
+endif
+
+# gtest is optional; when not installed, the 'gtest' subproject is used as fallback.
+gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false)
+
+inc = include_directories('include')
+
+subdir('include')
+subdir('scripts')
+subdir('static')
+subdir('src')
+subdir('examples')
+subdir('test')
+# The documentation is only built when the 'doc' option is enabled.
+if get_option('doc')
+ subdir('docs')
+endif
+
+# Generate libzim.pc. `libzim` is not defined in this file; it has to come
+# from one of the sub-directories processed above.
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries : libzim,
+ version : meson.project_version(),
+ name : 'libzim',
+ filebase : 'libzim',
+ description : 'A Library to read/write ZIM files.',
+ requires : pkg_requires)
--- /dev/null
+# Build options of libzim.
+# The cache sizes and the lzma memory size are declared as strings (not
+# integers); their values are numbers written as text.
+option('CLUSTER_CACHE_SIZE', type : 'string', value : '16',
+ description : 'set cluster cache size to number (default:16)')
+option('DIRENT_CACHE_SIZE', type : 'string', value : '512',
+ description : 'set dirent cache size to number (default:512)')
+option('DIRENT_LOOKUP_CACHE_SIZE', type : 'string', value : '1024',
+ description : 'set dirent lookup cache size to number (default:1024)')
+option('LZMA_MEMORY_SIZE', type : 'string', value : '128',
+ description : 'set lzma uncompress memory in MB (default:128)')
+# Boolean feature switches.
+option('USE_MMAP', type: 'boolean', value: true,
+ description: 'Use mmap to avoid copy from file. (default:true, always false on windows)')
+option('USE_BUFFER_HEADER', type: 'boolean', value: true,
+ description: '''Copy (or use mmap) header index buffers. (default:true)
+Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory.
+If false, we directly read the index in the file at each article access.''')
+option('static-linkage', type : 'boolean', value : false,
+ description : 'Link statically with the dependencies.')
+option('doc', type : 'boolean', value : false,
+ description : 'Build the documentations.')
+option('with_xapian', type : 'boolean', value: true,
+ description: 'Build libzim with xapian support')
+# Location of the external test data; the empty default and the special
+# value `none` are explained in the description below.
+option('test_data_dir', type : 'string', value: '',
+ description: 'Where the test data are. If not set, meson will use a internal directory in the build dir. If you want to download the data in the specified directory you can use `meson download_test_data`. As a special value, you can pass `none` to deactivate test using external test data.')
--- /dev/null
+#!/usr/bin/env python3
+
+'''
+Copyright 2021 Matthieu Gautier <mgautier@kymeria.fr>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or any
+later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA.
+'''
+
+import argparse
+from pathlib import Path
+from urllib import request
+from urllib.error import *
+import tarfile
+import sys
+
+TEST_DATA_VERSION = "0.3"
+ARCHIVE_URL_TEMPL = "https://github.com/openzim/zim-testing-suite/releases/download/v{version}/zim-testing-suite-{version}.tar.gz"
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--version', '-v',
+ help="The version to download.",
+ default=TEST_DATA_VERSION)
+ parser.add_argument('--remove-top-dir',
+ help="Remove the top directory when extracting",
+ action='store_true')
+ parser.add_argument('outdir',
+ help='The directory where to install the test data.')
+ args = parser.parse_args()
+
+ test_data_url = ARCHIVE_URL_TEMPL.format(version=args.version)
+
+ try:
+ with request.urlopen(test_data_url) as f:
+ with tarfile.open(fileobj=f, mode="r|*") as archive:
+ while True:
+ member = archive.next()
+ if member is None:
+ break
+ if args.remove_top_dir:
+ member.name = '/'.join(member.name.split('/')[1:])
+ archive.extract(member, path=args.outdir)
+
+ except HTTPError as e:
+ print("Error downloading archive at url : {}".format(test_data_url))
+ print(e)
+ sys.exit(1)
+ except OSError as e:
+ print("Error writing the test data on the file system.")
+ print(e)
+ sys.exit(1)
--- /dev/null
+#!/usr/bin/env python3
+
+'''
+Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or any
+later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA.
+'''
+
+import argparse
+import os.path
+import re
+
+def full_identifier(filename):
+ parts = os.path.normpath(filename).split(os.sep)
+ parts = [to_identifier(part) for part in parts]
+ print(filename, parts)
+ return parts
+
+def to_identifier(name):
+ ident = re.sub(r'[^0-9a-zA-Z]', '_', name)
+ if ident[0].isnumeric():
+ return "_"+ident
+ return ident
+
+# Per-resource chunk of the generated C++ source file: the byte array holding
+# the resource content, and the std::string constant (inside the RESOURCE
+# namespace) initialised through init_resource().
+# Doubled braces are literal braces for str.format().
+resource_impl_template = """
+static const unsigned char {data_identifier}[] = {{
+ {resource_content}
+}};
+
+namespace RESOURCE {{
+{namespaces_open}
+const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len});
+{namespaces_close}
+}}
+"""
+
+# Per-resource branch of the generated getResource_*() function: maps the
+# resource name to its constant.
+resource_getter_template = """
+ if (name == "{common_name}")
+ return RESOURCE::{identifier};
+"""
+
+# Per-resource declaration written to the generated header.
+resource_decl_template = """{namespaces_open}
+extern const std::string {identifier};
+{namespaces_close}"""
+
+class Resource:
+ """A resource file to embed, with its content and its generated identifiers."""
+ def __init__(self, base_dirs, filename):
+ """Read `filename` (stripped of surrounding whitespace) from the first
+ directory of `base_dirs` in which it can be opened.
+
+ Raises Exception when the file is found in none of the directories.
+ """
+ filename = filename.strip()
+ self.filename = filename
+ # List of identifiers, one per path component of the file name.
+ self.identifier = full_identifier(filename)
+ found = False
+ for base_dir in base_dirs:
+ try:
+ with open(os.path.join(base_dir, filename), 'rb') as f:
+ self.data = f.read()
+ found = True
+ break
+ except FileNotFoundError:
+ continue
+ if not found:
+ raise Exception("Impossible to found {}".format(filename))
+
+ def dump_impl(self):
+ """Return the C++ definition of this resource (resource_impl_template)."""
+ # Number of rows of 16 bytes needed to hold the data (rounded up).
+ nb_row = len(self.data)//16 + (1 if len(self.data) % 16 else 0)
+ sliced = (self.data[i*16:(i+1)*16] for i in range(nb_row))
+
+ return resource_impl_template.format(
+ # The leading "" makes the joined name start with an underscore.
+ data_identifier="_".join([""]+self.identifier),
+ # Each byte is written as a 0x.. literal, one row per line.
+ resource_content=",\n ".join(", ".join("{:#04x}".format(i) for i in r) for r in sliced),
+ resource_len=len(self.data),
+ # One namespace per path component but the last, which names the constant.
+ namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]),
+ namespaces_close=" ".join(["}"]*(len(self.identifier)-1)),
+ identifier=self.identifier[-1],
+ # Name of the environment variable passed to init_resource().
+ env_identifier="RES_"+"_".join(self.identifier)+"_PATH"
+ )
+
+ def dump_getter(self):
+ """Return the getter branch for this resource (resource_getter_template)."""
+ return resource_getter_template.format(
+ common_name=self.filename,
+ identifier="::".join(self.identifier)
+ )
+
+ def dump_decl(self):
+ """Return the header declaration of this resource (resource_decl_template)."""
+ return resource_decl_template.format(
+ namespaces_open=" ".join("namespace {} {{".format(id) for id in self.identifier[:-1]),
+ namespaces_close=" ".join(["}"]*(len(self.identifier)-1)),
+ identifier=self.identifier[-1]
+ )
+
+
+
+# Template of the generated C++ source file.
+# Placeholders: include_file, basename, RESOURCES_GETTER and RESOURCES.
+# The generated init_resource() returns the content of the file named by the
+# environment variable `name` when that variable is set and the file can be
+# opened, and the embedded bytes otherwise.
+master_c_template = """//This file is automaically generated. Do not modify it.
+
+#include <stdlib.h>
+#include <fstream>
+#include "{include_file}"
+
+static std::string init_resource(const char* name, const unsigned char* content, int len)
+{{
+ char * resPath = getenv(name);
+ if (NULL == resPath)
+ return std::string(reinterpret_cast<const char*>(content), len);
+
+ std::ifstream ifs(resPath);
+ if (!ifs.good())
+ return std::string(reinterpret_cast<const char*>(content), len);
+ return std::string( (std::istreambuf_iterator<char>(ifs)),
+ (std::istreambuf_iterator<char>() ));
+}}
+
+const std::string& getResource_{basename}(const std::string& name) {{
+{RESOURCES_GETTER}
+ throw ResourceNotFound("Resource not found.");
+}}
+
+{RESOURCES}
+
+"""
+
+def gen_c_file(resources, basename):
+ return master_c_template.format(
+ RESOURCES="\n\n".join(r.dump_impl() for r in resources),
+ RESOURCES_GETTER="\n\n".join(r.dump_getter() for r in resources),
+ include_file=basename,
+ basename=to_identifier(basename)
+ )
+
+
+
+# Template of the generated header file.
+# Placeholders: BASENAME (include guard), RESOURCES (declarations) and
+# basename (suffix of the getResource_* function).
+# The header also declares the ResourceNotFound exception and the
+# getResource() convenience macro.
+master_h_template = """//This file is automaically generated. Do not modify it.
+#ifndef KIWIX_{BASENAME}
+#define KIWIX_{BASENAME}
+
+#include <string>
+#include <stdexcept>
+
+namespace RESOURCE {{
+ {RESOURCES}
+}};
+
+class ResourceNotFound : public std::runtime_error {{
+ public:
+ ResourceNotFound(const std::string& what_arg):
+ std::runtime_error(what_arg)
+ {{ }};
+}};
+
+const std::string& getResource_{basename}(const std::string& name);
+
+#define getResource(a) (getResource_{basename}(a))
+
+#endif // KIWIX_{BASENAME}
+
+"""
+
+def gen_h_file(resources, basename):
+ return master_h_template.format(
+ RESOURCES="\n ".join(r.dump_decl() for r in resources),
+ BASENAME=basename.upper(),
+ basename=basename,
+ )
+
+# Entry point: read the list of resources and write the generated header
+# and C++ source files.
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--cxxfile',
+ help='The Cpp file name to generate')
+ parser.add_argument('--hfile',
+ help='The h file name to generate')
+ parser.add_argument('--source_dir',
+ help="Additional directory where to look for resources.",
+ action='append')
+ parser.add_argument('resource_file',
+ help='The list of resources to compile.')
+ args = parser.parse_args()
+
+ # Resources are looked up first next to the resource list file, then in
+ # the --source_dir directories, in the order given.
+ base_dir = os.path.dirname(os.path.realpath(args.resource_file))
+ source_dir = args.source_dir or []
+ # One resource per line of the resource list file.
+ with open(args.resource_file, 'r') as f:
+ resources = [Resource([base_dir]+source_dir, filename)
+ for filename in f.readlines()]
+
+ # NOTE(review): --cxxfile and --hfile are not marked as required although
+ # both are used unconditionally below.
+ h_identifier = to_identifier(os.path.basename(args.hfile))
+ with open(args.hfile, 'w') as f:
+ f.write(gen_h_file(resources, h_identifier))
+
+ with open(args.cxxfile, 'w') as f:
+ f.write(gen_c_file(resources, os.path.basename(args.hfile)))
+
--- /dev/null
+
+# Locate the helper scripts: the resource compiler and the test data downloader.
+res_compiler = find_program('libzim-compile-resources')
+test_data_downloader = find_program('download_test_data.py')
--- /dev/null
+/*
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DIRENT_H
+#define ZIM_DIRENT_H
+
+#include <string>
+#include <zim/zim.h>
+#include <exception>
+#include <memory>
+
+#include "zim_types.h"
+#include "debug.h"
+
+namespace zim
+{
+ class Buffer;
+ // Exception type with no state of its own.
+ // NOTE(review): nothing in this header throws it; presumably raised by the
+ // code that parses dirents when a size is invalid - confirm.
+ class InvalidSize : public std::exception {};
+ // A directory entry: mime type, version, namespace, url, title and
+ // parameter of an entry, plus either the cluster/blob numbers of its content
+ // or, for a redirect, the index of the target entry.
+ class Dirent
+ {
+   protected:
+     uint16_t mimeType;
+
+     uint32_t version;
+
+     // Content location; only meaningful when the entry is not a redirect.
+     cluster_index_t clusterNumber;
+     blob_index_t blobNumber;
+
+     // Redirect target; only meaningful when the entry is a redirect.
+     entry_index_t redirectIndex;
+
+     char ns;
+     std::string title;
+     std::string url;
+     std::string parameter;
+
+   public:
+     // Reserved values of the mimeType field that mark special entries.
+     static const uint16_t redirectMimeType = 0xffff;
+     static const uint16_t linktargetMimeType = 0xfffe;
+     static const uint16_t deletedMimeType = 0xfffd;
+
+     // Build an empty dirent: numeric fields are zero, strings are empty.
+     Dirent()
+       : mimeType(0),
+         version(0),
+         clusterNumber(0),
+         blobNumber(0),
+         redirectIndex(0),
+         ns('\0')
+     {}
+
+     // Entry kind predicates, all derived from the mimeType field.
+     bool isRedirect() const { return redirectMimeType == mimeType; }
+     bool isLinktarget() const { return linktargetMimeType == mimeType; }
+     bool isDeleted() const { return deletedMimeType == mimeType; }
+     // An article is anything that is not one of the three special kinds.
+     bool isArticle() const { return !(isRedirect() || isLinktarget() || isDeleted()); }
+     uint16_t getMimeType() const { return mimeType; }
+
+     uint32_t getVersion() const { return version; }
+     void setVersion(uint32_t v) { version = v; }
+
+     // Cluster number of the content, or 0 for a redirect.
+     cluster_index_t getClusterNumber() const
+     {
+       if (isRedirect())
+         return cluster_index_t(0);
+       return clusterNumber;
+     }
+
+     // Blob number of the content, or 0 for a redirect.
+     blob_index_t getBlobNumber() const
+     {
+       if (isRedirect())
+         return blob_index_t(0);
+       return blobNumber;
+     }
+
+     // Index of the redirect target, or 0 when the entry is not a redirect.
+     entry_index_t getRedirectIndex() const
+     {
+       if (!isRedirect())
+         return entry_index_t(0);
+       return redirectIndex;
+     }
+
+     char getNamespace() const { return ns; }
+
+     // The title, falling back to the url when no title is set.
+     const std::string& getTitle() const
+     {
+       if (title.empty())
+         return url;
+       return title;
+     }
+
+     const std::string& getUrl() const { return url; }
+     std::string getLongUrl() const;
+     const std::string& getParameter() const { return parameter; }
+
+     // Size computed from the fields: a fixed part (12 for a redirect, 16
+     // otherwise), the url, the parameter, 2 extra bytes, and the title when
+     // it differs from the url.
+     // NOTE(review): the 2 extra bytes are presumably the string terminators
+     // of the serialized form - confirm against the file format.
+     size_t getDirentSize() const
+     {
+       size_t total = isRedirect() ? 12 : 16;
+       total += url.size() + parameter.size() + 2;
+       if (title != url)
+         total += title.size();
+       return total;
+     }
+
+     void setTitle(const std::string& title_) { title = title_; }
+
+     // Set the namespace and the url together.
+     void setUrl(char ns_, const std::string& url_)
+     {
+       url = url_;
+       ns = ns_;
+     }
+
+     void setParameter(const std::string& parameter_) { parameter = parameter_; }
+
+     // Turn the entry into a redirect pointing to entry `idx`.
+     void setRedirect(entry_index_t idx)
+     {
+       mimeType = redirectMimeType;
+       redirectIndex = idx;
+     }
+
+     // Record the mime type and the cluster/blob location of the content.
+     void setItem(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_)
+     {
+       blobNumber = blobNumber_;
+       clusterNumber = clusterNumber_;
+       mimeType = mimeType_;
+     }
+ };
+}
+
+#endif // ZIM_DIRENT_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/entry.h>
+#include <zim/item.h>
+#include <zim/error.h>
+#include "fileimpl.h"
+#include "tools.h"
+#include "log.h"
+
+log_define("zim.archive")
+
+namespace zim
+{
+ // Create an archive backed by a new FileImpl built from the file name `fname`.
+ Archive::Archive(const std::string& fname)
+ : m_impl(new FileImpl(fname))
+ { }
+
+#ifndef _WIN32
+ // Create an archive backed by a new FileImpl built from the file descriptor `fd`.
+ Archive::Archive(int fd)
+ : m_impl(new FileImpl(fd))
+ { }
+
+ // Create an archive backed by a new FileImpl built from the file descriptor
+ // `fd`, with `offset` and `size` forwarded as offset_t and zsize_t.
+ // NOTE(review): presumably the archive occupies `size` bytes starting at
+ // `offset` within the descriptor - confirm against FileImpl.
+ Archive::Archive(int fd, offset_type offset, size_type size)
+ : m_impl(new FileImpl(fd, offset_t(offset), zsize_t(size)))
+ { }
+#endif
+
+ // Return the file name reported by the underlying FileImpl.
+ const std::string& Archive::getFilename() const
+ {
+   const std::string& filename = m_impl->getFilename();
+   return filename;
+ }
+
+ // Return the file size reported by the underlying FileImpl, as a raw value.
+ size_type Archive::getFilesize() const
+ {
+   auto fileSize = m_impl->getFilesize();
+   return fileSize.v;
+ }
+
+ // Return the total entry count reported by FileImpl::getCountArticles().
+ entry_index_type Archive::getAllEntryCount() const
+ {
+   auto allEntries = m_impl->getCountArticles();
+   return allEntries.v;
+ }
+
+ // Return the number of user entries reported by the underlying FileImpl.
+ entry_index_type Archive::getEntryCount() const
+ {
+   auto userEntries = m_impl->getUserEntryCount();
+   return userEntries.v;
+ }
+
+ // Return the number of articles: the front entry count when the archive has
+ // a front articles index, otherwise the number of entries in the content
+ // namespace ('C' with the new namespace scheme, 'A' without).
+ entry_index_type Archive::getArticleCount() const
+ {
+   if (m_impl->hasFrontArticlesIndex())
+     return m_impl->getFrontEntryCount().v;
+
+   const char ns = m_impl->hasNewNamespaceScheme() ? 'C' : 'A';
+   return m_impl->getNamespaceEntryCount(ns).v;
+ }
+
+ // Return the uuid stored in the file header.
+ Uuid Archive::getUuid() const
+ {
+   auto& header = m_impl->getFileheader();
+   return header.getUuid();
+ }
+
+ // Return the item of the metadata entry `name`, looked up in namespace 'M'.
+ // Throws EntryNotFound when there is no such entry.
+ Item Archive::getMetadataItem(const std::string& name) const
+ {
+   auto found = m_impl->findx('M', name);
+   if (found.first) {
+     return Entry(m_impl, entry_index_type(found.second)).getItem(true);
+   }
+   throw EntryNotFound("Cannot find metadata");
+ }
+
+ // Return the content of the metadata entry `name` as a string.
+ // Any exception raised by getMetadataItem() propagates to the caller.
+ std::string Archive::getMetadata(const std::string& name) const
+ {
+   return getMetadataItem(name).getData();
+ }
+
+ // Return the url of every entry of namespace 'M', in index order.
+ std::vector<std::string> Archive::getMetadataKeys() const {
+   std::vector<std::string> keys;
+   auto idx = m_impl->getNamespaceBeginOffset('M');
+   auto last = m_impl->getNamespaceEndOffset('M');
+   while (idx != last) {
+     keys.push_back(m_impl->getDirent(idx)->getUrl());
+     idx++;
+   }
+   return keys;
+ }
+
+ // Look for a favicon entry: "favicon" then "favicon.png", first in
+ // namespace '-' and then in namespace 'I'. Returns the first successful
+ // findx result; throws EntryNotFound when none of the four lookups matches.
+ zim::FileImpl::FindxResult findFavicon(FileImpl& impl)
+ {
+   const char namespaces[] = {'-', 'I'};
+   const char* const paths[] = {"favicon", "favicon.png"};
+   for (char ns : namespaces) {
+     for (const char* path : paths) {
+       auto found = impl.findx(ns, path);
+       if (!found.first) {
+         continue;
+       }
+       return found;
+     }
+   }
+   throw EntryNotFound("No favicon found.");
+ }
+
+ // Return the illustration item of the given size, i.e. the metadata entry
+ // "Illustration_<size>x<size>@1". When it is missing and `size` is 48, fall
+ // back to the favicon found by findFavicon(). Throws EntryNotFound
+ // otherwise (findFavicon() throws it as well when no favicon exists).
+ Item Archive::getIllustrationItem(unsigned int size) const {
+   std::ostringstream pathBuilder;
+   pathBuilder << "Illustration_" << size << "x" << size << "@" << 1;
+   auto exact = m_impl->findx('M', pathBuilder.str());
+   if (exact.first) {
+     return getEntryByPath(entry_index_type(exact.second)).getItem();
+   }
+   // We haven't found the exact entry. We could "search" for an illustration
+   // and use the first one found.
+#if 0
+   // We have decided not to implement a fallback for a wrong resolution for now.
+   // We keep this code for reference.
+   exact = m_impl->findx('M', "Illustration");
+   auto entry = getEntryByPath(entry_index_type(exact.second));
+   if (entry.getPath().find("Illustration") == 0) {
+     return entry.getItem();
+   }
+#endif
+   if (size != 48) {
+     throw EntryNotFound("Cannot find illustration item.");
+   }
+   // For a 48x48 illustration, return the favicon of older zims.
+   auto favicon = findFavicon(*m_impl);
+   return getEntryByPath(entry_index_type(favicon.second)).getItem(true);
+ }
+
+ // Return the set of sizes for which an illustration is available.
+ // Entries are walked from the position findx gives for "Illustration_" in
+ // namespace 'M' for as long as their path starts with "Illustration_";
+ // paths that cannot be parsed to a size are ignored. Size 48 is added as
+ // well when an (old format) favicon exists.
+ std::set<unsigned int> Archive::getIllustrationSizes() const {
+   std::set<unsigned int> sizes;
+   auto idx = m_impl->findx('M', "Illustration_").second;
+   while (true) {
+     std::string path;
+     try {
+       path = getEntryByPath(entry_index_type(idx)).getPath();
+     } catch (const std::out_of_range&) {
+       // We went past the last entry of the archive.
+       break;
+     }
+     if (path.find("Illustration_") != 0) {
+       break;
+     }
+     try {
+       sizes.insert(parseIllustrationPathToSize(path));
+     } catch (...) {}
+     idx++;
+   }
+   if (sizes.count(48) == 0) {
+     try {
+       // Throws an exception if we cannot find the (old format) favicon.
+       findFavicon(*m_impl);
+       sizes.insert(48);
+     } catch(EntryNotFound&) {}
+   }
+   return sizes;
+ }
+
+ // Tell whether getIllustrationItem(size) succeeds, i.e. does not throw
+ // EntryNotFound. Other exceptions propagate.
+ bool Archive::hasIllustration(unsigned int size) const {
+   bool found = true;
+   try {
+     getIllustrationItem(size);
+   } catch (EntryNotFound&) {
+     found = false;
+   }
+   return found;
+ }
+
+ // Return the entry at index `idx` (path order).
+ // Throws std::out_of_range when `idx` is not below the total entry count.
+ Entry Archive::getEntryByPath(entry_index_type idx) const
+ {
+   const auto entryCount = entry_index_type(m_impl->getCountArticles());
+   if (idx < entryCount) {
+     return Entry(m_impl, idx);
+   }
+   throw std::out_of_range("entry index out of range");
+ }
+
+ // Return the entry stored at `path`.
+ // The path is tried as given first; when that fails, a second form is tried
+ // so that paths saved against an archive using the other namespace scheme
+ // (e.g. bookmarks) still resolve. Throws EntryNotFound when nothing matches.
+ Entry Archive::getEntryByPath(const std::string& path) const
+ {
+   if (!m_impl->hasNewNamespaceScheme()) {
+     // The path should contain the namespace.
+     auto hit = m_impl->findx(path);
+     if (hit.first) {
+       return Entry(m_impl, entry_index_type(hit.second));
+     }
+     // If not, it may be a path (bookmark) coming from a recent zim archive:
+     // try it in each of the old content namespaces.
+     for (auto ns : {'A', 'I', 'J', '-'}) {
+       hit = m_impl->findx(ns, path);
+       if (hit.first) {
+         return Entry(m_impl, entry_index_type(hit.second));
+       }
+     }
+     throw EntryNotFound("Cannot find entry");
+   }
+
+   // Get the path in user content.
+   auto hit = m_impl->findx('C', path);
+   if (hit.first) {
+     return Entry(m_impl, entry_index_type(hit.second));
+   }
+   try {
+     // The path may have been stored from an old zim archive (bookmark) and
+     // so contain a namespace. Strip it and search in the 'C' namespace.
+     hit = m_impl->findx('C', std::get<1>(parseLongPath(path)));
+     if (hit.first) {
+       return Entry(m_impl, entry_index_type(hit.second));
+     }
+   } catch (std::runtime_error&) {}
+
+   throw EntryNotFound("Cannot find entry");
+ }
+
+ // Return the entry at position `idx` of the title-ordered listing.
+ Entry Archive::getEntryByTitle(entry_index_type idx) const
+ {
+   auto pathIndex = m_impl->getIndexByTitle(title_index_t(idx));
+   return Entry(m_impl, entry_index_type(pathIndex));
+ }
+
+ // Return the first entry whose title is `title`, searching the namespaces
+ // 'C', 'A', 'I', 'J' and '-' in that order.
+ // Throws EntryNotFound when no namespace holds such a title.
+ Entry Archive::getEntryByTitle(const std::string& title) const
+ {
+   const char namespaces[] = {'C', 'A', 'I', 'J', '-'};
+   for (char ns : namespaces) {
+     log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')');
+     auto found = m_impl->findxByTitle(ns, title);
+     if (!found.first) {
+       continue;
+     }
+     return getEntryByTitle(entry_index_type(found.second));
+   }
+   throw EntryNotFound("Cannot find entry");
+ }
+
+ // Return the entry at position `idx` of the cluster-ordered listing.
+ Entry Archive::getEntryByClusterOrder(entry_index_type idx) const
+ {
+   auto pathIndex = m_impl->getIndexByClusterOrder(entry_index_t(idx));
+   return Entry(m_impl, entry_index_type(pathIndex));
+ }
+
+ // Return the main entry: the 'W' namespace entry "mainPage" when it exists,
+ // otherwise the entry designated by the file header.
+ // Throws EntryNotFound when neither is available.
+ Entry Archive::getMainEntry() const {
+   auto wellKnown = m_impl->findx('W', "mainPage");
+   if (!wellKnown.first) {
+     auto& header = m_impl->getFileheader();
+     if (header.hasMainPage()) {
+       return getEntryByPath(header.getMainPage());
+     }
+     throw EntryNotFound("No main page");
+   }
+   return getEntryByPath(entry_index_type(wellKnown.second));
+ }
+
+ // Tell whether getMainEntry() can find a main entry.
+ // getMainEntry() first looks for the 'W' namespace entry "mainPage" and only
+ // then falls back to the file header, so both sources are checked here too;
+ // checking the header alone would report false for an archive whose main
+ // entry is only reachable through "W/mainPage".
+ bool Archive::hasMainEntry() const {
+   if (m_impl->findx('W', "mainPage").first) {
+     return true;
+   }
+   return m_impl->getFileheader().hasMainPage();
+ }
+
+ // Return an entry picked with randomNumber().
+ // - Without the new namespace scheme, the entry is picked by path index
+ //   among the entries of namespace 'A'.
+ // - With the new namespace scheme, the entry is picked by title index among
+ //   the front entries.
+ // Throws EntryNotFound when the set to pick from is empty.
+ // NOTE(review): randomNumber(max) is presumably inclusive of `max`, hence
+ // the `- 1` below - confirm against its definition.
+ Entry Archive::getRandomEntry() const {
+   if ( !m_impl->hasNewNamespaceScheme() ) {
+     const auto startOfNamespaceA = m_impl->getNamespaceBeginOffset('A');
+     const auto endOfNamespaceA = m_impl->getNamespaceEndOffset('A');
+     const auto n = (endOfNamespaceA - startOfNamespaceA).v;
+     if ( n == 0 ) {
+       // The closing parenthesis was missing from this message.
+       throw EntryNotFound("Cannot find valid random entry (empty namespace 'A')");
+     }
+     return getEntryByPath(startOfNamespaceA.v + randomNumber(n-1));
+   } else {
+     auto frontEntryCount = m_impl->getFrontEntryCount().v;
+     if (frontEntryCount == 0) {
+       throw EntryNotFound("Cannot find valid random entry (no front entry at all)");
+     }
+
+     return getEntryByTitle(randomNumber(frontEntryCount-1));
+   }
+ }
+
+ // Tell whether the archive has a usable fulltext index.
+ // The index entry is searched at "X/fulltext/xapian", then at
+ // "Z//fulltextIndex/xapian". The result is the second member of the item's
+ // direct access information converted to bool.
+ bool Archive::hasFulltextIndex() const {
+   auto found = m_impl->findx('X', "fulltext/xapian");
+   if (!found.first) {
+     found = m_impl->findx('Z', "/fulltextIndex/xapian");
+     if (!found.first) {
+       return false;
+     }
+   }
+   auto item = Entry(m_impl, entry_index_type(found.second)).getItem(true);
+   auto accessInfo = item.getDirectAccessInformation();
+   return accessInfo.second;
+ }
+
+ // Tell whether the archive has a usable title index.
+ // The index entry is searched at "X/title/xapian". The result is the second
+ // member of the item's direct access information converted to bool.
+ bool Archive::hasTitleIndex() const {
+   auto found = m_impl->findx('X', "title/xapian");
+   if (found.first) {
+     auto item = Entry(m_impl, entry_index_type(found.second)).getItem(true);
+     auto accessInfo = item.getDirectAccessInformation();
+     return accessInfo.second;
+   }
+   return false;
+ }
+
+ // Return the range of all user entries, in path order.
+ Archive::EntryRange<EntryOrder::pathOrder> Archive::iterByPath() const
+ {
+   auto first = m_impl->getStartUserEntry().v;
+   auto last = m_impl->getEndUserEntry().v;
+   return EntryRange<EntryOrder::pathOrder>(m_impl, first, last);
+ }
+
+ // Return a title-ordered range over the articles of the archive. The
+ // bounds depend on what the archive provides; see the three cases below.
+ Archive::EntryRange<EntryOrder::titleOrder> Archive::iterByTitle() const
+ {
+   if (m_impl->hasFrontArticlesIndex()) {
+     // We have a front articles index: we can "simply" loop over all front
+     // entries.
+     return EntryRange<EntryOrder::titleOrder>(
+       m_impl, 0, m_impl->getFrontEntryCount().v);
+   }
+
+   if (m_impl->hasNewNamespaceScheme()) {
+     // We are a zim archive without namespaces but without a specific
+     // articles listing. We don't have the choice here: iterate on all user
+     // entries.
+     return EntryRange<EntryOrder::titleOrder>(
+       m_impl, m_impl->getStartUserEntry().v, m_impl->getEndUserEntry().v);
+   }
+
+   // We are an old zim archive with namespaces: iterate on the 'A' namespace.
+   return EntryRange<EntryOrder::titleOrder>(
+     m_impl,
+     m_impl->getNamespaceBeginOffset('A').v,
+     m_impl->getNamespaceEndOffset('A').v);
+ }
+
+ // Return the range [0, getEntryCount()) in "efficient" order.
+ Archive::EntryRange<EntryOrder::efficientOrder> Archive::iterEfficient() const
+ {
+   const auto entryCount = getEntryCount();
+   return EntryRange<EntryOrder::efficientOrder>(m_impl, 0, entryCount);
+ }
+
+ // Return the path-ordered range of the entries whose path starts with
+ // `path`. An empty path or "/" selects all user entries. Without the new
+ // namespace scheme, `path` must be a long path ("NS/url"); an empty range is
+ // returned when it cannot be parsed.
+ Archive::EntryRange<EntryOrder::pathOrder> Archive::findByPath(std::string path) const
+ {
+   // "url order" means that the entries are sorted by long url ("NS/url").
+   //
+   // If we really wanted to search by url whatever the namespace is, we would
+   // have to search in all "content" namespaces (A, I, J, -) and then merge
+   // the results.
+   //
+   // It would be pretty complex as we would need to iterate over several
+   // ranges at the same time. Let's enforce that the path is the full path and
+   // search in whatever namespace is in it.
+
+   // We have to return two iterators delimiting the range of entries having
+   // `path` as a prefix:
+   // - The begin iterator points to the first entry with `path` as a prefix
+   //   (or to the (range) end if there is none).
+   // - The end iterator points past the last entry with `path` as a prefix
+   //   (or to the (global) end).
+   //
+   // findx returns the index of the exact match or of the entry just after.
+   // So, for the begin iterator, we can simply use the index returned by
+   // findx. For the end iterator we do the same with the prefix "just after"
+   // the queried `path`, so the end index is always just after the prefix
+   // range. If there is no prefix range, both begin and end are just after
+   // where it would be.
+   //
+   // Supposing this list of entries:
+   // 0. aaaaaa
+   // 1. aaaaab
+   // 2. aabbaa
+   // 3. aabbbb
+   // 4. bbaaaa
+   // 5. bbbb
+   // 6. bbbbaa
+   // 7. bbbbbb
+   // 8. <past the end>
+
+   // If we search for prefix aabb, we must return 2/4
+   // A findx with aabb will return 2
+   // A findx with aabc will return 4
+   //
+   // If we search for prefix bbbb, we must return 5/8
+   // A findx with bbbb will return 5 (with exact match)
+   // A findx with bbbc will return 8
+   //
+   // If we search for prefix cccc, we must return 8/8
+   // A findx with cccc will return 8
+   // A findx with cccd will return 8
+   //
+   // If we search for prefix a, we must return 0/4
+   // A findx with a will return 0
+   // A findx with b will return 4
+   entry_index_t first, last;
+   if (path.empty() || path == "/") {
+     // No prefix at all: the range is every user entry.
+     first = m_impl->getStartUserEntry();
+     last = m_impl->getEndUserEntry();
+   } else if (!m_impl->hasNewNamespaceScheme()) {
+     char ns;
+     try {
+       std::tie(ns, path) = parseLongPath(path);
+     } catch (...) {
+       return Archive::EntryRange<EntryOrder::pathOrder>(m_impl, 0, 0);
+     }
+     first = m_impl->findx(ns, path).second;
+     // Build the prefix "just after": bump the last character of the path,
+     // or the namespace itself when the path within the namespace is empty.
+     if (!path.empty()) {
+       path.back()++;
+     } else {
+       ns++;
+     }
+     last = m_impl->findx(ns, path).second;
+   } else {
+     first = m_impl->findx('C', path).second;
+     path.back()++;
+     last = m_impl->findx('C', path).second;
+   }
+   return Archive::EntryRange<EntryOrder::pathOrder>(m_impl, first.v, last.v);
+ }
+
+ // Return the title-ordered range of the entries whose title starts with
+ // `title`, searched in namespace 'C' (new namespace scheme) or 'A'.
+ // An empty `title` selects every entry of that namespace.
+ Archive::EntryRange<EntryOrder::titleOrder> Archive::findByTitle(std::string title) const
+ {
+   // "title order" means that the entries are sorted by the "NS/title" part.
+   // It is nice when we want to search for a title in a specific namespace,
+   // but now we want to hide the namespace. It would be better if the "title
+   // order" were a real title order, whatever the namespace is.
+   //
+   // If we really wanted to search by title whatever the namespace is, we
+   // would have to search in all "content" namespaces and then merge the
+   // results.
+   //
+   // The find by title is only used for the articles, so let's search only in
+   // the articles' namespace.
+
+   // The end of the range is found by searching for the prefix "just after"
+   // the queried one; `Archive::findByPath` applies the same scheme to paths.
+   auto ns = m_impl->hasNewNamespaceScheme() ? 'C' : 'A';
+   auto begin_idx = m_impl->findxByTitle(ns, title).second;
+   auto end_ns = ns;
+   if (title.empty()) {
+     // There is no last character to bump (calling back() on an empty string
+     // is undefined behaviour): the prefix "just after" is the next namespace.
+     end_ns++;
+   } else {
+     title.back()++;
+   }
+   auto end_idx = m_impl->findxByTitle(end_ns, title).second;
+   return Archive::EntryRange<EntryOrder::titleOrder>(m_impl, begin_idx.v, end_idx.v);
+ }
+
+ // Tell whether the file header declares a checksum.
+ bool Archive::hasChecksum() const
+ {
+   auto& header = m_impl->getFileheader();
+   return header.hasChecksum();
+ }
+
+ // Return the checksum reported by the underlying FileImpl.
+ std::string Archive::getChecksum() const
+ {
+   std::string checksum = m_impl->getChecksum();
+   return checksum;
+ }
+
+ // Return the result of FileImpl::verify().
+ bool Archive::check() const
+ {
+   const bool verified = m_impl->verify();
+   return verified;
+ }
+
+ // Return the result of FileImpl::is_multiPart().
+ bool Archive::isMultiPart() const
+ {
+   const bool multiPart = m_impl->is_multiPart();
+   return multiPart;
+ }
+
+ // Tell whether the underlying FileImpl reports the new namespace scheme.
+ bool Archive::hasNewNamespaceScheme() const
+ {
+   const bool newScheme = m_impl->hasNewNamespaceScheme();
+   return newScheme;
+ }
+
+ // Return the number of clusters reported by the underlying FileImpl.
+ cluster_index_type Archive::getClusterCount() const
+ {
+   auto clusterCount = m_impl->getCountClusters();
+   return cluster_index_type(clusterCount);
+ }
+
+ // Return the offset of cluster `idx` as reported by the underlying FileImpl.
+ offset_type Archive::getClusterOffset(cluster_index_type idx) const
+ {
+   auto clusterOffset = m_impl->getClusterOffset(cluster_index_t(idx));
+   return offset_type(clusterOffset);
+ }
+
+ // Return the main page index stored in the file header.
+ entry_index_type Archive::getMainEntryIndex() const
+ {
+   auto& header = m_impl->getFileheader();
+   return header.getMainPage();
+ }
+
+ // Path order: the index already is a path-order index, return it unchanged.
+ // The FileImpl argument is not needed.
+ template<>
+ entry_index_type
+ _toPathOrder<EntryOrder::pathOrder>(const FileImpl&, entry_index_type idx)
+ {
+   return idx;
+ }
+
+ // Title order: translate the title-order index `idx` through
+ // FileImpl::getIndexByTitle().
+ template<>
+ entry_index_type
+ _toPathOrder<EntryOrder::titleOrder>(const FileImpl& impl, entry_index_type idx)
+ {
+   auto pathIndex = impl.getIndexByTitle(title_index_t(idx));
+   return pathIndex.v;
+ }
+
+ // Efficient order: translate the cluster-order index `idx` through
+ // FileImpl::getIndexByClusterOrder().
+ template<>
+ entry_index_type
+ _toPathOrder<EntryOrder::efficientOrder>(const FileImpl& impl, entry_index_type idx)
+ {
+   auto pathIndex = impl.getIndexByClusterOrder(entry_index_t(idx));
+   return pathIndex.v;
+ }
+
+ // Run the integrity check `checkType` on the underlying FileImpl and return
+ // its result.
+ bool Archive::checkIntegrity(IntegrityCheck checkType)
+ {
+   const bool passed = m_impl->checkIntegrity(checkType);
+   return passed;
+ }
+
+ // Open the archive at `zimPath` and run every integrity check enabled in
+ // `checksToRun`, in increasing order, stopping at the first failure.
+ // Returns false when a check fails or when a ZimFileFormatError is raised
+ // (its message is then printed on std::cerr); returns true otherwise.
+ bool validate(const std::string& zimPath, IntegrityCheckList checksToRun)
+ {
+   try
+   {
+     Archive archive(zimPath);
+     size_t i = 0;
+     while ( i < checksToRun.size() )
+     {
+       const bool requested = checksToRun.test(i);
+       if ( requested && !archive.checkIntegrity(IntegrityCheck(i)) )
+         return false;
+       ++i;
+     }
+   }
+   catch(ZimFileFormatError &exception)
+   {
+     std::cerr << exception.what() << std::endl;
+     return false;
+   }
+
+   return true;
+ }
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+
+#include "zim/blob.h"
+#include "debug.h"
+#include "buffer.h"
+
+namespace zim {
+
+namespace
+{
+
+// Deleter that does nothing with the pointer it is given.
+struct NoDelete
+{
+  template<class Pointee> void operator()(Pointee*) {}
+};
+
+// Null shared_ptr with a no-op deleter. It is the source object passed to
+// the std::shared_ptr aliasing constructor (which avoids allocating a new
+// control block) whenever the referred data must not be deleted.
+static Blob::DataPtr nonOwnedDataPtr(static_cast<char*>(nullptr), NoDelete());
+
+} // unnamed namespace
+
+
+// Build an empty blob: size 0, data pointer shared with nonOwnedDataPtr.
+Blob::Blob()
+ : _data(nonOwnedDataPtr),
+ _size(0)
+{}
+
+// Build a blob referring to `size` bytes at `data` without taking ownership:
+// the pointer is wrapped with the aliasing constructor on nonOwnedDataPtr,
+// whose deleter does nothing.
+Blob::Blob(const char* data, size_type size)
+ : _data(nonOwnedDataPtr, data),
+ _size(size)
+{
+ // Sanity checks: the size must be below SIZE_MAX and the address must be
+ // below SIZE_MAX-size, i.e. data+size must not wrap around.
+ ASSERT(size, <, SIZE_MAX);
+ ASSERT(data, <, (void*)(SIZE_MAX-size));
+}
+
+// Build a blob of `size` bytes sharing the data pointer `buffer`.
+Blob::Blob(const DataPtr& buffer, size_type size)
+ : _data(buffer),
+ _size(size)
+{}
+
+
+
+
+} //zim
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "buffer.h"
+
+#include <sys/stat.h>
+#include <cstdio>
+#include <cstdlib>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <sstream>
+
+#ifndef _WIN32
+# include <sys/mman.h>
+# include <unistd.h>
+#endif
+
+namespace zim {
+
+namespace
+{
+
+// Deleter that does nothing with the pointer it is given.
+struct NoDelete
+{
+  template<class Pointee> void operator()(Pointee*) {}
+};
+
+// Null shared_ptr with a no-op deleter. It is the source object passed to
+// the std::shared_ptr aliasing constructor (which avoids allocating a new
+// control block) whenever the referred data must not be deleted.
+static Buffer::DataPtr nonOwnedDataPtr(static_cast<char*>(nullptr), NoDelete());
+
+} // unnamed namespace
+
+// Return a buffer covering `size` bytes starting at `offset` of this one.
+// The result shares ownership of the data with this buffer (aliasing
+// constructor on m_data). Both bounds are checked with ASSERT.
+const Buffer Buffer::sub_buffer(offset_t offset, zsize_t size) const
+{
+  ASSERT(offset.v, <=, m_size.v);
+  ASSERT(offset.v+size.v, <=, m_size.v);
+  return Buffer(DataPtr(m_data, data(offset)), size);
+}
+
+// Build a buffer of `size` bytes sharing the data pointer `data`.
+const Buffer Buffer::makeBuffer(const DataPtr& data, zsize_t size)
+{
+  Buffer shared(data, size);
+  return shared;
+}
+
+// Build a buffer referring to `size` bytes at `data` without taking
+// ownership: the pointer is wrapped with the aliasing constructor on
+// nonOwnedDataPtr, whose deleter does nothing.
+const Buffer Buffer::makeBuffer(const char* data, zsize_t size)
+{
+  DataPtr nonOwning(nonOwnedDataPtr, data);
+  return Buffer(nonOwning, size);
+}
+
+// Build a buffer owning a freshly allocated array of `size` bytes (released
+// with delete[]). A zero size yields a buffer with a null, non-owned data
+// pointer and no allocation.
+Buffer Buffer::makeBuffer(zsize_t size)
+{
+  if (size.v != 0) {
+    return Buffer(DataPtr(new char[size.v], std::default_delete<char[]>()), size);
+  }
+  return Buffer(DataPtr(nonOwnedDataPtr, nullptr), size);
+}
+
+// Build a buffer of `size` bytes sharing the data pointer `data`.
+Buffer::Buffer(const DataPtr& data, zsize_t size)
+ : m_size(size),
+ m_data(data)
+{
+ // Sanity check: the size must be below SIZE_MAX.
+ ASSERT(m_size.v, <, SIZE_MAX);
+}
+
+// Return the address of the byte at `offset`. An offset equal to the size
+// (one past the last byte) passes the ASSERT.
+const char*
+Buffer::data(offset_t offset) const {
+  ASSERT(offset.v, <=, m_size.v);
+  const char* const base = m_data.get();
+  return base + offset.v;
+}
+
+} //zim
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BUFFER_H_
+#define ZIM_BUFFER_H_
+
+#include <cstddef>
+#include <exception>
+#include <memory>
+#include <iostream>
+
+#include "config.h"
+#include "zim_types.h"
+#include "endian_tools.h"
+#include "debug.h"
+#include <zim/blob.h>
+
+namespace zim {
+
+// A sized view on a block of bytes held through a shared pointer.
+// Instances are created with the makeBuffer() factories or sub_buffer();
+// the constructor is private.
+class Buffer {
+  public: // types
+    using DataPtr = std::shared_ptr<const char>;
+
+  public: // functions
+    // Factories: from a raw pointer, from a shared data pointer, or from a
+    // size alone.
+    static const Buffer makeBuffer(const char* data, zsize_t size);
+    static const Buffer makeBuffer(const DataPtr& data, zsize_t size);
+    static Buffer makeBuffer(zsize_t size);
+
+    // Address of the byte at `offset` (the start of the data by default).
+    const char* data(offset_t offset=offset_t(0)) const;
+
+    // Byte stored at `offset`.
+    char at(offset_t offset) const { return *data(offset); }
+
+    zsize_t size() const { return m_size; }
+
+    // Buffer covering `size` bytes starting at `offset` of this one.
+    const Buffer sub_buffer(offset_t offset, zsize_t size) const;
+
+    // Conversion to a Blob built from the same data pointer and size.
+    operator Blob() const { return Blob(m_data, m_size.v); }
+
+  private: // functions
+    Buffer(const DataPtr& data, zsize_t size);
+
+  private: // data
+    zsize_t m_size;
+    DataPtr m_data;
+};
+
+} // zim namespace
+
+#endif //ZIM_BUFFER_H_
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/error.h>
+#include "buffer_reader.h"
+#include "buffer.h"
+
+#include <cstring>
+
+namespace zim {
+
+// Return the sub-buffer of the source covering `size` bytes from `offset`.
+const Buffer BufferReader::get_buffer(offset_t offset, zsize_t size) const
+{
+  const Buffer slice = source.sub_buffer(offset, size);
+  return slice;
+}
+
+// Return a new BufferReader reading the `size` bytes starting at `offset`.
+std::unique_ptr<const Reader> BufferReader::sub_reader(offset_t offset, zsize_t size) const
+{
+  return std::unique_ptr<const Reader>(new BufferReader(get_buffer(offset, size)));
+}
+
+// Return the size of the source buffer.
+zsize_t BufferReader::size() const
+{
+  const zsize_t sourceSize = source.size();
+  return sourceSize;
+}
+
+// Return, as an offset, the numeric value of the address of the first byte
+// of the source buffer.
+offset_t BufferReader::offset() const
+{
+  const void* const start = static_cast<const void*>(source.data(offset_t(0)));
+  return offset_t((offset_type)(start));
+}
+
+
+// Copy `size` bytes starting at `offset` of the source buffer into `dest`.
+// Both bounds are checked with ASSERT; nothing is copied for a zero size.
+void BufferReader::read(char* dest, offset_t offset, zsize_t size) const {
+  ASSERT(offset.v, <=, source.size().v);
+  ASSERT(offset+offset_t(size.v), <=, offset_t(source.size().v));
+  if (!size) {
+    return;
+  }
+  const char* const from = source.data(offset);
+  memcpy(dest, from, size.v);
+}
+
+
+// Return the byte at `offset` of the source buffer. The offset is checked
+// with ASSERT to be strictly below the buffer size.
+char BufferReader::read(offset_t offset) const {
+  ASSERT(offset.v, <, source.size().v);
+  return *source.data(offset);
+}
+
+
+} // zim
--- /dev/null
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BUFFER_READER_H_
+#define ZIM_BUFFER_READER_H_
+
+#include "reader.h"
+
+namespace zim {
+
+// Reader whose data comes from a Buffer, of which it keeps its own copy.
+class BufferReader : public Reader {
+  public:
+    BufferReader(const Buffer& buffer)
+      : source(buffer) {}
+    virtual ~BufferReader() {}
+
+    // Size of the source buffer.
+    zsize_t size() const;
+    // Address of the start of the source buffer, expressed as an offset.
+    offset_t offset() const;
+
+    // Copy `size` bytes starting at `offset` into `dest`.
+    void read(char* dest, offset_t offset, zsize_t size) const;
+    // Single byte at `offset`.
+    char read(offset_t offset) const;
+    // Sub-buffer / sub-reader covering `size` bytes starting at `offset`.
+    const Buffer get_buffer(offset_t offset, zsize_t size) const;
+    std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const;
+
+  private:
+    const Buffer source;
+};
+
+};
+
+#endif // ZIM_BUFFER_READER_H_
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BUFFERSTREAMER_H
+#define ZIM_BUFFERSTREAMER_H
+
+#include "debug.h"
+
+#include <string.h>
+
+namespace zim
+{
+
+// Sequential reader over the bytes of a Buffer: keeps a cursor into the data
+// and a count of the bytes left, both updated by read() and skip().
+// Neither read() nor skip() checks that enough bytes are left.
+class BufferStreamer
+{
+public: // functions
+  // Stream over `buffer`, starting at its first byte, with `size` bytes
+  // announced as available.
+  BufferStreamer(const Buffer& buffer, zsize_t size)
+    : m_buffer(buffer),
+      m_current(buffer.data()),
+      m_size(size)
+  {}
+
+  // Stream over the whole of `buffer`.
+  explicit BufferStreamer(const Buffer& buffer)
+    : BufferStreamer(buffer, buffer.size())
+  {}
+
+  // Reads a little-endian value of type T at the cursor and advances the
+  // cursor by sizeof(T).
+  //
+  // For best portability this function should be used with types of known
+  // bit-width (int32_t, uint16_t, etc) rather than builtin types with
+  // unknown bit-width (int, unsigned, etc).
+  template<typename T> T read()
+  {
+    char raw[sizeof(T)];
+    memcpy(raw, m_current, sizeof(T));
+    skip(zsize_t(sizeof(T)));
+    return fromLittleEndian<T>(raw); // XXX: This handles only integral types
+  }
+
+  // Current position of the cursor.
+  const char* current() const { return m_current; }
+
+  // Number of bytes left in the stream.
+  zsize_t left() const { return m_size; }
+
+  // Advance the cursor by `nbBytes` and reduce the bytes left accordingly.
+  void skip(zsize_t nbBytes) {
+    m_size -= nbBytes;
+    m_current += nbBytes.v;
+  }
+
+private: // data
+  const Buffer m_buffer;
+  const char* m_current;
+  zsize_t m_size;
+};
+
+} // namespace zim
+
+#endif // ZIM_BUFFERSTREAMER_H
--- /dev/null
+/*
+ * Copyright (C) 2016-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "cluster.h"
+#include <zim/blob.h>
+#include <zim/error.h>
+#include "buffer_reader.h"
+#include "endian_tools.h"
+#include "bufferstreamer.h"
+#include "decoderstreamreader.h"
+#include "rawstreamreader.h"
+#include <algorithm>
+#include <stdlib.h>
+#include <sstream>
+
+#include "compression.h"
+#include "log.h"
+
+#include "config.h"
+
+log_define("zim.cluster")
+
+#define log_debug1(e)
+
+namespace zim
+{
+
+namespace
+{
+
+// Decode the cluster info byte found at `offset` of `zimReader` and return a
+// stream reader over the bytes that follow it.
+//
+// The low 4 bits of the info byte give the compression (stored in `*comp`),
+// bit 0x10 tells whether the cluster is extended (stored in `*extended`).
+// Throws std::runtime_error for the compression values 2 and 3, and
+// ZimFileFormatError for any value other than None, Lzma or Zstd.
+std::unique_ptr<IStreamReader>
+getClusterReader(const Reader& zimReader, offset_t offset, Compression* comp, bool* extended)
+{
+  const uint8_t clusterInfo = zimReader.read(offset);
+  const uint8_t compInfo = clusterInfo & 0x0F;
+  switch (compInfo) {
+    case 0:
+      // Very old zim files used 0 as a "default" compression, which means
+      // no compression.
+      *comp = Compression::None;
+      break;
+    case 2: /* Zip compression */
+      throw std::runtime_error("zlib not enabled in this library");
+    case 3: /* Bzip2 compression */
+      throw std::runtime_error("bzip2 not enabled in this library");
+    default:
+      *comp = static_cast<Compression>(compInfo);
+      break;
+  }
+  *extended = (clusterInfo & 0x10) != 0;
+
+  // The cluster content starts right after the info byte.
+  auto subReader = std::shared_ptr<const Reader>(zimReader.sub_reader(offset+offset_t(1)));
+
+  if (*comp == Compression::None) {
+    return std::unique_ptr<IStreamReader>(new RawStreamReader(subReader));
+  }
+  if (*comp == Compression::Lzma) {
+    return std::unique_ptr<IStreamReader>(new DecoderStreamReader<LZMA_INFO>(subReader));
+  }
+  if (*comp == Compression::Zstd) {
+    return std::unique_ptr<IStreamReader>(new DecoderStreamReader<ZSTD_INFO>(subReader));
+  }
+  throw ZimFileFormatError("Invalid compression flag");
+}
+
+} // unnamed namespace
+
+ // Factory: decodes the info byte of the cluster located at `clusterOffset`
+ // in `zimReader` and builds the corresponding Cluster.
+ std::shared_ptr<Cluster> Cluster::read(const Reader& zimReader, offset_t clusterOffset)
+ {
+   bool extendedFlag;
+   Compression compressionKind;
+   std::unique_ptr<IStreamReader> streamReader =
+       getClusterReader(zimReader, clusterOffset, &compressionKind, &extendedFlag);
+   return std::make_shared<Cluster>(std::move(streamReader), compressionKind, extendedFlag);
+ }
+
+ // Takes ownership of `reader_` and immediately parses the blob offset
+ // table, using 64-bit offsets for extended clusters and 32-bit ones
+ // otherwise.
+ Cluster::Cluster(std::unique_ptr<IStreamReader> reader_, Compression comp, bool isExtended)
+   : compression(comp),
+     isExtended(isExtended),
+     m_reader(std::move(reader_))
+ {
+   // NB: `isExtended` here is the constructor parameter (it shadows the member).
+   if (!isExtended) {
+     read_header<uint32_t>();
+     return;
+   }
+   read_header<uint64_t>();
+ }
+
+ /* Reads the blob offset table found at the start of the cluster data and
+  * stores it in `m_blobOffsets`.
+  *
+  * The first offset of the table is also the size of the table itself, so
+  * it gives the number of entries to read.
+  *
+  * Throws ZimFileFormatError when the first offset is too small to hold even
+  * one table entry (corrupted cluster).
+  */
+ template<typename OFFSET_TYPE>
+ void Cluster::read_header()
+ {
+   // read first offset, which specifies, how many offsets we need to read
+   OFFSET_TYPE offset = m_reader->read<OFFSET_TYPE>();
+
+   // The table contains at least the entry we have just read. A smaller
+   // value would give a zero entry count (wrapping around when decremented
+   // in the loop below) and an underflowed table size.
+   if (offset < sizeof(OFFSET_TYPE)) {
+     throw ZimFileFormatError("Error parsing cluster. Offset of the first blob is too small.");
+   }
+
+   size_t n_offset = offset / sizeof(OFFSET_TYPE);
+
+   // read offsets
+   m_blobOffsets.clear();
+   m_blobOffsets.reserve(n_offset);
+   m_blobOffsets.push_back(offset_t(offset));
+
+   // Get the whole offsets data at once to avoid too many (system) calls.
+   auto bufferSize = zsize_t(offset-sizeof(OFFSET_TYPE));
+   auto buffer = m_reader->sub_reader(bufferSize)->get_buffer(offset_t(0), bufferSize);
+   auto seqReader = BufferStreamer(buffer, bufferSize);
+   while (--n_offset)
+   {
+     OFFSET_TYPE new_offset = seqReader.read<OFFSET_TYPE>();
+     // Offsets are expected to be non-decreasing (checked in debug builds only).
+     ASSERT(new_offset, >=, offset);
+
+     m_blobOffsets.push_back(offset_t(new_offset));
+     offset = new_offset;
+   }
+ }
+
+ /* Returns the size of blob `n`, i.e. the distance between its offset and
+  * the next entry of `m_blobOffsets`.
+  *
+  * Throws ZimFileFormatError when `n` is not a blob of this cluster.
+  */
+ zsize_t Cluster::getBlobSize(blob_index_t n) const
+ {
+   const auto idx = blob_index_type(n);
+   // Compare against `size() - 1` instead of computing `idx + 1` in the
+   // index type: that addition wraps to 0 for the largest index value,
+   // which would slip through the range check and index out of bounds.
+   if (m_blobOffsets.empty() || idx >= m_blobOffsets.size() - 1) {
+     throw ZimFileFormatError("blob index out of range");
+   }
+   // Lossless: idx is known to be smaller than the vector size here.
+   const size_t pos = static_cast<size_t>(idx);
+   return zsize_t(m_blobOffsets[pos+1].v - m_blobOffsets[pos].v);
+ }
+
+ // Returns the reader of blob `n`, creating (under `m_readerAccessMutex`)
+ // the readers of every blob up to `n` that do not exist yet, in blob
+ // order. A blob whose size does not fit in size_t gets an empty reader.
+ const Reader& Cluster::getReader(blob_index_t n) const
+ {
+   std::lock_guard<std::mutex> guard(m_readerAccessMutex);
+   blob_index_type next(m_blobReaders.size());
+   while (next <= n.v) {
+     auto size = getBlobSize(blob_index_t(next));
+     if (size.v <= SIZE_MAX) {
+       m_blobReaders.push_back(m_reader->sub_reader(size));
+     } else {
+       m_blobReaders.push_back(std::unique_ptr<Reader>(new BufferReader(Buffer::makeBuffer(zsize_t(0)))));
+     }
+     ++next;
+   }
+   return *m_blobReaders[blob_index_type(n)];
+ }
+
+ // Returns the whole content of blob `n`. An empty Blob is returned when
+ // `n` is not a blob of this cluster or when the blob size does not fit in
+ // size_t.
+ Blob Cluster::getBlob(blob_index_t n) const
+ {
+   if (!(n < count())) {
+     return Blob();
+   }
+
+   const auto wholeSize = getBlobSize(n);
+   if (wholeSize.v > SIZE_MAX) {
+     return Blob();
+   }
+   return getReader(n).get_buffer(offset_t(0), wholeSize);
+ }
+
+ // Returns at most `size` bytes of blob `n`, starting at `offset` inside
+ // the blob. The size is clamped to what is left after `offset`. An empty
+ // Blob is returned when `n` is not a blob of this cluster, when `offset`
+ // is past the end of the blob or when the resulting size does not fit in
+ // size_t.
+ Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const
+ {
+   if (!(n < count())) {
+     return Blob();
+   }
+
+   const auto wholeSize = getBlobSize(n);
+   if ( offset.v > wholeSize.v ) {
+     return Blob();
+   }
+
+   const zsize_t remaining(wholeSize.v-offset.v);
+   size = std::min(size, remaining);
+   if (size.v > SIZE_MAX) {
+     return Blob();
+   }
+   return getReader(n).get_buffer(offset, size);
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2016-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2020 Miguel Rocha
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_CLUSTER_H
+#define ZIM_CLUSTER_H
+
+#include <zim/zim.h>
+#include "buffer.h"
+#include "zim_types.h"
+#include "file_reader.h"
+#include <iosfwd>
+#include <vector>
+#include <memory>
+#include <mutex>
+
+#include "zim_types.h"
+#include "zim/error.h"
+
+namespace zim
+{
+ class Blob;
+ class Reader;
+ class IStreamReader;
+
+ // A cluster of a zim file: a set of blobs stored together and read through
+ // a single (possibly decompressing) stream reader.
+ // Instances are built by `Cluster::read()`; the offset table is parsed in
+ // the constructor and the per-blob readers are created on demand.
+ class Cluster : public std::enable_shared_from_this<Cluster> {
+ typedef std::vector<offset_t> BlobOffsets;
+ typedef std::vector<std::unique_ptr<const Reader>> BlobReaders;
+
+ public:
+ // Compression of the cluster data, as decoded from the cluster info byte.
+ const Compression compression;
+ // True when the offset table uses 64-bit entries (32-bit otherwise).
+ const bool isExtended;
+
+ private:
+ // Stream over the cluster data (after the cluster info byte).
+ std::unique_ptr<IStreamReader> m_reader;
+
+ // offsets of the blob boundaries relative to the start of the cluster data
+ // (*after* the first byte (clusterInfo))
+ // For a cluster with N blobs, this collection contains N+1 entries.
+ // The start of the first blob and the end of the last blob are included.
+ BlobOffsets m_blobOffsets;
+
+ // Readers of the blobs requested so far, created lazily by getReader()
+ // while holding m_readerAccessMutex.
+ mutable std::mutex m_readerAccessMutex;
+ mutable BlobReaders m_blobReaders;
+
+
+ // Parses the offset table; OFFSET_TYPE is the width of one table entry.
+ template<typename OFFSET_TYPE>
+ void read_header();
+ // Returns the reader of blob `n`, creating missing readers up to `n`.
+ const Reader& getReader(blob_index_t n) const;
+
+ public:
+ Cluster(std::unique_ptr<IStreamReader> reader, Compression comp, bool isExtended);
+ Compression getCompression() const { return compression; }
+ bool isCompressed() const { return compression != Compression::None; }
+
+ // Number of blobs: one less than the number of stored boundaries.
+ blob_index_t count() const { return blob_index_t(m_blobOffsets.size() - 1); }
+
+ // Throws ZimFileFormatError when `n` is out of range.
+ zsize_t getBlobSize(blob_index_t n) const;
+
+ // Offset of blob `n` counted from the cluster info byte (hence the +1).
+ // NOTE(review): `n` is not range-checked here.
+ offset_t getBlobOffset(blob_index_t n) const { return offset_t(1) + m_blobOffsets[blob_index_type(n)]; }
+ // Both overloads return an empty Blob for an out-of-range request.
+ Blob getBlob(blob_index_t n) const;
+ Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const;
+
+ // Builds the cluster stored at `clusterOffset` in `zimReader`.
+ static std::shared_ptr<Cluster> read(const Reader& zimReader, offset_t clusterOffset);
+ };
+
+}
+
+#endif // ZIM_CLUSTER_H
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Emmanuel Engelhart <kelson@kiwix.org>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "compression.h"
+
+#include "envvalue.h"
+
+#include <stdexcept>
+
+const std::string LZMA_INFO::name = "lzma";
+// Resets `stream` and turns it into a lzma stream decoder.
+// The decoder memory limit is obtained from zim::envMemSize() using the
+// ZIM_LZMA_MEMORY_SIZE name, with LZMA_MEMORY_SIZE MiB as the fallback value.
+// `raw_data` is unused. Throws std::runtime_error if liblzma refuses to
+// set the decoder up.
+void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+{
+  *stream = LZMA_STREAM_INIT;
+  unsigned memLimit = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024);
+  if (lzma_stream_decoder(stream, memLimit, 0) != LZMA_OK) {
+    throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream");
+  }
+}
+
+// Resets `stream` and turns it into a lzma encoder using preset
+// 9 | LZMA_PRESET_EXTREME with a CRC32 integrity check.
+// `raw_data` is unused. Throws std::runtime_error on failure.
+void LZMA_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+{
+  *stream = LZMA_STREAM_INIT;
+  const auto status = lzma_easy_encoder(stream, 9 | LZMA_PRESET_EXTREME, LZMA_CHECK_CRC32);
+  if (status == LZMA_OK) {
+    return;
+  }
+  throw std::runtime_error("Cannot initialize lzma_easy_encoder");
+}
+
+// Runs one encoding step; lzma uses the same entry point for both
+// directions, so this simply forwards to stream_run().
+CompStatus LZMA_INFO::stream_run_encode(stream_t* stream, CompStep step) {
+ return stream_run(stream, step);
+}
+
+// Runs one decoding step; forwards to stream_run(), which is shared with
+// the encoding direction.
+CompStatus LZMA_INFO::stream_run_decode(stream_t* stream, CompStep step) {
+ return stream_run(stream, step);
+}
+
+// Calls lzma_code() on `stream` (LZMA_RUN for CompStep::STEP, LZMA_FINISH
+// otherwise) and translates the result into a CompStatus.
+// Any lzma status other than OK / STREAM_END / BUF_ERROR is reported with a
+// std::runtime_error.
+CompStatus LZMA_INFO::stream_run(stream_t* stream, CompStep step)
+{
+  const lzma_action action = (step == CompStep::STEP) ? LZMA_RUN : LZMA_FINISH;
+  const auto status = lzma_code(stream, action);
+
+  if (status == LZMA_OK) {
+    return CompStatus::OK;
+  }
+  if (status == LZMA_STREAM_END) {
+    return CompStatus::STREAM_END;
+  }
+  if (status == LZMA_BUF_ERROR) {
+    return CompStatus::BUF_ERROR;
+  }
+
+  std::ostringstream message;
+  message << "Unexpected lzma status : " << status;
+  throw std::runtime_error(message.str());
+}
+
+// Releases the decoder state attached to `stream` (lzma_end()).
+void LZMA_INFO::stream_end_decode(stream_t* stream)
+{
+ lzma_end(stream);
+}
+
+// Releases the encoder state attached to `stream` (lzma_end()).
+void LZMA_INFO::stream_end_encode(stream_t* stream)
+{
+ lzma_end(stream);
+}
+
+
+const std::string ZSTD_INFO::name = "zstd";
+
+// Creates an idle stream: no input, no output and no zstd context yet.
+// The contexts are created by init_stream_encoder() / init_stream_decoder().
+ZSTD_INFO::stream_t::stream_t()
+: next_in(nullptr),
+ avail_in(0),
+ next_out(nullptr),
+ avail_out(0),
+ total_out(0),
+ encoder_stream(nullptr),
+ decoder_stream(nullptr)
+{}
+
+// Frees whichever zstd contexts were created for this stream. This is where
+// the zstd resources are released (stream_end_encode/decode are no-ops).
+ZSTD_INFO::stream_t::~stream_t()
+{
+ if ( encoder_stream )
+ ::ZSTD_freeCStream(encoder_stream);
+
+ if ( decoder_stream )
+ ::ZSTD_freeDStream(decoder_stream);
+}
+
+// Creates and initializes the zstd decompression context of `stream`.
+// `raw_data` is unused. Throws std::runtime_error if the context cannot be
+// allocated or initialized.
+void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+{
+  stream->decoder_stream = ::ZSTD_createDStream();
+  // Creation fails by returning a null pointer, which must not be handed
+  // over to ZSTD_initDStream().
+  if (stream->decoder_stream == nullptr) {
+    throw std::runtime_error("Failed to initialize Zstd decompression");
+  }
+  auto ret = ::ZSTD_initDStream(stream->decoder_stream);
+  if (::ZSTD_isError(ret)) {
+    throw std::runtime_error("Failed to initialize Zstd decompression");
+  }
+}
+
+// Creates and initializes the zstd compression context of `stream`, using
+// compression level 19. `raw_data` is unused. Throws std::runtime_error if
+// the context cannot be allocated or initialized.
+void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+{
+  stream->encoder_stream = ::ZSTD_createCStream();
+  // Creation fails by returning a null pointer, which must not be handed
+  // over to ZSTD_initCStream().
+  if (stream->encoder_stream == nullptr) {
+    throw std::runtime_error("Failed to initialize Zstd compression");
+  }
+  auto ret = ::ZSTD_initCStream(stream->encoder_stream, 19);
+  if (::ZSTD_isError(ret)) {
+    throw std::runtime_error("Failed to initialize Zstd compression");
+  }
+}
+
+// Runs one compression step on `stream`.
+//
+// CompStep::STEP feeds the pending input to ZSTD_compressStream(); any other
+// step flushes and closes the frame with ZSTD_endStream(). The in/out
+// cursors and `total_out` of `stream` are advanced by what zstd consumed and
+// produced.
+//
+// Returns BUF_ERROR when input is left unconsumed (STEP) or when zstd still
+// has data to flush (finish), OK otherwise. Throws std::runtime_error with
+// the zstd error name on failure.
+CompStatus ZSTD_INFO::stream_run_encode(stream_t* stream, CompStep step) {
+  ::ZSTD_inBuffer input = { stream->next_in, stream->avail_in, 0 };
+  ::ZSTD_outBuffer output = { stream->next_out, stream->avail_out, 0 };
+
+  const bool stepping = (step == CompStep::STEP);
+  auto ret = stepping
+           ? ::ZSTD_compressStream(stream->encoder_stream, &output, &input)
+           : ::ZSTD_endStream(stream->encoder_stream, &output);
+
+  // Report back how far zstd went, even when it failed.
+  stream->next_in += input.pos;
+  stream->avail_in -= input.pos;
+  stream->next_out += output.pos;
+  stream->avail_out -= output.pos;
+  stream->total_out += output.pos;
+
+  if (::ZSTD_isError(ret)) {
+    throw std::runtime_error(::ZSTD_getErrorName(ret));
+  }
+
+  if ( !stepping ) {
+    // A positive value means zstd could not flush everything.
+    return ret > 0 ? CompStatus::BUF_ERROR : CompStatus::OK;
+  }
+
+  if ( stream->avail_in == 0 ) {
+    return CompStatus::OK;
+  }
+  // Input is left: this is expected only when the output buffer is full.
+  ASSERT(stream->avail_out, ==, 0u);
+  return CompStatus::BUF_ERROR;
+}
+
+// Runs one decompression step on `stream` (the step kind is ignored).
+//
+// The in/out cursors and `total_out` of `stream` are advanced by what
+// ZSTD_decompressStream() consumed and produced.
+//
+// Returns STREAM_END when zstd reports 0 and BUF_ERROR in every other
+// non-error case; CompStatus::OK is never returned. Throws
+// std::runtime_error with the zstd error name on failure.
+CompStatus ZSTD_INFO::stream_run_decode(stream_t* stream, CompStep /*step*/) {
+  ::ZSTD_inBuffer input = { stream->next_in, stream->avail_in, 0 };
+  ::ZSTD_outBuffer output = { stream->next_out, stream->avail_out, 0 };
+
+  auto ret = ::ZSTD_decompressStream(stream->decoder_stream, &output, &input);
+
+  // Report back how far zstd went, even when it failed.
+  stream->next_in += input.pos;
+  stream->avail_in -= input.pos;
+  stream->next_out += output.pos;
+  stream->avail_out -= output.pos;
+  stream->total_out += output.pos;
+
+  if (::ZSTD_isError(ret)) {
+    throw std::runtime_error(::ZSTD_getErrorName(ret));
+  }
+
+  return (ret == 0) ? CompStatus::STREAM_END : CompStatus::BUF_ERROR;
+}
+
+// Nothing to do: the zstd decoder context is freed by ~stream_t().
+void ZSTD_INFO::stream_end_decode(stream_t* stream)
+{
+}
+
+// Nothing to do: the zstd encoder context is freed by ~stream_t().
+void ZSTD_INFO::stream_end_encode(stream_t* stream)
+{
+}
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Emmanuel Engelhart <kelson@kiwix.org>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef _LIBZIM_COMPRESSION_
+#define _LIBZIM_COMPRESSION_
+
+#include <vector>
+#include "string.h"
+
+#include "file_reader.h"
+#include <zim/error.h>
+
+#include "config.h"
+
+#include <lzma.h>
+#include <zstd.h>
+
+#include "zim_types.h"
+#include "constants.h"
+
+//#define DEB(X) std::cerr << __func__ << " " << X << std::endl ;
+#define DEB(X)
+
+// Kind of step requested from a codec: STEP processes more data, FINISH
+// asks the codec to terminate the stream.
+enum class CompStep {
+ STEP,
+ FINISH
+};
+
+// Codec-independent result of one stream_run_encode/stream_run_decode call.
+// The Compressor/Uncompressor loops use BUF_ERROR to decide whether more
+// input is needed or the output buffer must grow.
+enum class CompStatus {
+ OK,
+ STREAM_END,
+ BUF_ERROR,
+};
+
+// Result of Compressor::feed() / Uncompressor::feed().
+// NEED_MORE: the given input was handled, more can be fed.
+// OK: (Uncompressor only) the end of the compressed stream was reached.
+enum class RunnerStatus {
+ OK,
+ NEED_MORE,
+ ERROR
+};
+
+// Static adaptor exposing liblzma through the interface expected by the
+// codec-generic templates (Compressor, Uncompressor, DecoderStreamReader).
+// The stream state is the native lzma_stream.
+struct LZMA_INFO {
+ typedef lzma_stream stream_t;
+ // Codec name used in error messages.
+ static const std::string name;
+ // `raw_data` is unused by both init functions.
+ static void init_stream_decoder(stream_t* stream, char* raw_data);
+ static void init_stream_encoder(stream_t* stream, char* raw_data);
+ // Both directions forward to stream_run().
+ static CompStatus stream_run_encode(stream_t* stream, CompStep step);
+ static CompStatus stream_run_decode(stream_t* stream, CompStep step);
+ static CompStatus stream_run(stream_t* stream, CompStep step);
+ // Release the lzma state attached to the stream.
+ static void stream_end_encode(stream_t* stream);
+ static void stream_end_decode(stream_t* stream);
+};
+
+
+// Static adaptor exposing zstd through the interface expected by the
+// codec-generic templates (Compressor, Uncompressor, DecoderStreamReader).
+struct ZSTD_INFO {
+ // Stream state mimicking the in/out cursor fields of lzma_stream so that
+ // the generic code can drive both codecs the same way. It owns the zstd
+ // contexts (freed in the destructor) and is therefore not copyable.
+ struct stream_t
+ {
+ const unsigned char* next_in;
+ size_t avail_in;
+ unsigned char* next_out;
+ size_t avail_out;
+ size_t total_out;
+
+ ::ZSTD_CStream* encoder_stream;
+ ::ZSTD_DStream* decoder_stream;
+
+ stream_t();
+ ~stream_t();
+ private:
+ stream_t(const stream_t& t) = delete;
+ void operator=(const stream_t& t) = delete;
+ };
+
+ // Codec name used in error messages.
+ static const std::string name;
+ // `raw_data` is unused by both init functions.
+ static void init_stream_decoder(stream_t* stream, char* raw_data);
+ static void init_stream_encoder(stream_t* stream, char* raw_data);
+ static CompStatus stream_run_encode(stream_t* stream, CompStep step);
+ static CompStatus stream_run_decode(stream_t* stream, CompStep step);
+ // No-ops: the contexts are released by ~stream_t().
+ static void stream_end_encode(stream_t* stream);
+ static void stream_end_decode(stream_t* stream);
+};
+
+
+namespace zim {
+
+// Incremental decompressor writing into a single, growing output buffer.
+//
+// Usage, as done by uncompress(): init(), then feed() chunks of compressed
+// data until it returns RunnerStatus::OK, then get_data() to take ownership
+// of the output.
+// INFO is the codec adaptor (LZMA_INFO or ZSTD_INFO).
+template<typename INFO>
+class Uncompressor
+{
+ public:
+ // `initial_size` is the starting capacity of the output buffer; it is
+ // doubled each time it turns out to be too small.
+ Uncompressor(size_t initial_size) :
+ ret_data(new char[initial_size]),
+ data_size(initial_size)
+ {}
+ ~Uncompressor() = default;
+
+ // Sets up the decoder and points its output at the internal buffer.
+ // `data` is only forwarded to INFO::init_stream_decoder().
+ void init(char* data) {
+ INFO::init_stream_decoder(&stream, data);
+ stream.next_out = (uint8_t*)ret_data.get();
+ stream.avail_out = data_size;
+ }
+
+ // Decodes the `size` bytes at `data`, growing the output buffer as needed.
+ // Returns NEED_MORE once the input has been consumed without reaching the
+ // end of the stream, OK when the end of the compressed stream is reached.
+ RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) {
+ stream.next_in = (unsigned char*)data;
+ stream.avail_in = size;
+ while (true) {
+ auto errcode = INFO::stream_run_decode(&stream, step);
+ DEB((int)errcode)
+ switch (errcode) {
+ case CompStatus::BUF_ERROR:
+ if (stream.avail_in == 0 && stream.avail_out != 0) {
+ // End of input stream.
+ // compressor hasn't recognize the end of the input stream but there is
+ // no more input.
+ return RunnerStatus::NEED_MORE;
+ } else {
+ // Not enought output size.
+ // Allocate more memory and continue the loop.
+ DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out)
+ data_size *= 2;
+ std::unique_ptr<char[]> new_ret_data(new char[data_size]);
+ // Keep what has been decoded so far and resume right after it.
+ memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
+ stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
+ stream.avail_out = data_size - stream.total_out;
+ DEB(data_size << " " << stream.avail_out << " " << stream.avail_in)
+ ret_data = std::move(new_ret_data);
+ }
+ break;
+ case CompStatus::OK:
+ // On first call where lzma cannot progress (no output size).
+ // Lzma return OK. If we return NEED_MORE, then we will try to compress
+ // with new input data, but we should not as current one is not processed.
+ // We must do a second step to have te BUF_ERROR and handle thing correctly.
+ // If we have no more input, then we must ask for more.
+ if (stream.avail_in == 0) {
+ return RunnerStatus::NEED_MORE;
+ }
+ break;
+ case CompStatus::STREAM_END:
+ // End of compressed stream. Everything is ok.
+ return RunnerStatus::OK;
+ default:
+ // unreachable
+ return RunnerStatus::ERROR;
+ }
+ };
+ // unreachable
+ return RunnerStatus::NEED_MORE;
+ }
+
+ // Runs a final FINISH step (its result is ignored), stores the number of
+ // decoded bytes in `*size` and hands the output buffer over to the caller.
+ // The object must not be fed again afterwards.
+ std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
+ feed(nullptr, 0, CompStep::FINISH);
+ size->v = stream.total_out;
+ INFO::stream_end_decode(&stream);
+ return std::move(ret_data);
+ }
+
+ private:
+ std::unique_ptr<char[]> ret_data; // output buffer
+ size_type data_size; // capacity of ret_data
+ typename INFO::stream_t stream; // codec state
+};
+
+#define CHUNCK_SIZE ((zim::size_type)(1024))
+/**
+ * Uncompress data of the reader at startOffset.
+ *
+ * @param reader The reader where the data is.
+ * @param startOffset The offset where the data is in the reader.
+ * @param[out] dest_size The size of the uncompressed data.
+ * @return The uncompressed data (owned by the returned unique_ptr).
+ * @throws zim::ZimFileFormatError if the compressed stream is invalid or
+ *         if the reader ends before the end of the compressed stream.
+*/
+template<typename INFO>
+std::unique_ptr<char[]> uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) {
+  // Use an uncompressor to uncompress the data.
+  // As we don't know the result size, neither the compressed size,
+  // we have to do chunk by chunk until decompressor is happy.
+  // Let's assume it will be something like the default clusterSize used at creation
+  Uncompressor<INFO> runner(DEFAULT_CLUSTER_SIZE);
+  // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
+  // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
+  std::vector<char> raw_data(CHUNCK_SIZE);
+
+  DEB("Init")
+  runner.init(raw_data.data());
+
+  zim::size_type availableSize = reader->size().v - startOffset.v;
+  auto ret = RunnerStatus::NEED_MORE;
+  while(ret != RunnerStatus::OK) {
+    if (ret == RunnerStatus::NEED_MORE) {
+      if (availableSize == 0) {
+        // The decoder wants more data but the reader has none left.
+        // Nothing could ever change `ret` anymore: report a truncated
+        // stream instead of looping forever.
+        throw zim::ZimFileFormatError(std::string("Unexpected end of ") + INFO::name
+                                      + std::string(" stream for cluster."));
+      }
+      zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
+      reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize));
+      startOffset.v += inputSize;
+      availableSize -= inputSize;
+      DEB("Step " << startOffset.v)
+      ret = runner.feed(raw_data.data(), inputSize);
+      DEB("Ret " << (int)ret)
+    }
+    if (ret == RunnerStatus::ERROR) {
+      throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name
+                                    + std::string(" stream for cluster."));
+    }
+  }
+
+  DEB("Finish")
+  return runner.get_data(dest_size);
+}
+
+// Incremental compressor writing into a single, growing output buffer.
+//
+// Usage: init(), then feed() the data to compress (possibly in several
+// calls), then get_data() to finish the stream and take ownership of the
+// compressed output.
+// INFO is the codec adaptor (LZMA_INFO or ZSTD_INFO).
+template<typename INFO>
+class Compressor
+{
+ public:
+ // `initial_size` is the starting capacity of the output buffer; it is
+ // doubled each time it turns out to be too small.
+ Compressor(size_t initial_size=1024*1024) :
+ ret_data(new char[initial_size]),
+ ret_size(initial_size)
+ {}
+
+ ~Compressor() = default;
+
+ // Sets up the encoder and points its output at the internal buffer.
+ // `data` is only forwarded to INFO::init_stream_encoder().
+ void init(char* data) {
+ INFO::init_stream_encoder(&stream, data);
+ stream.next_out = (uint8_t*)ret_data.get();
+ stream.avail_out = ret_size;
+ }
+
+ // Compresses the `size` bytes at `data`, growing the output buffer as
+ // needed. Returns NEED_MORE on success and ERROR if the codec reports a
+ // buffer error although output space is still available.
+ RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) {
+ stream.next_in = (unsigned char*)data;
+ stream.avail_in = size;
+ while (true) {
+ auto errcode = INFO::stream_run_encode(&stream, step);
+ switch (errcode) {
+ case CompStatus::OK:
+ if (stream.avail_out == 0) {
+ // lzma return a OK return status the first time it runs out of output memory.
+ // The BUF_ERROR is returned only the second time we call a lzma_code.
+ continue;
+ } else {
+ return RunnerStatus::NEED_MORE;
+ }
+ case CompStatus::STREAM_END:
+ return RunnerStatus::NEED_MORE;
+ case CompStatus::BUF_ERROR:
+ if (stream.avail_out == 0) {
+ //Not enought output size
+ ret_size *= 2;
+ std::unique_ptr<char[]> new_ret_data(new char[ret_size]);
+ // Keep what has been produced so far and resume right after it.
+ memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
+ stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
+ stream.avail_out = ret_size - stream.total_out;
+ ret_data = std::move(new_ret_data);
+ continue;
+ } else {
+ return RunnerStatus::ERROR;
+ }
+ break;
+ default:
+ // unreachable
+ return RunnerStatus::ERROR;
+ };
+ };
+ // urreachable
+ return RunnerStatus::NEED_MORE;
+ }
+
+ // Finishes the stream (the result of the final feed() is ignored), stores
+ // the compressed size in `*size` and hands the output buffer over to the
+ // caller. The object must not be fed again afterwards.
+ // NOTE(review): total_out is read after stream_end_encode(); this relies
+ // on the codec leaving that counter untouched when ending the stream.
+ std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
+ feed(nullptr, 0, CompStep::FINISH);
+ INFO::stream_end_encode(&stream);
+ size->v = stream.total_out;
+ return std::move(ret_data);
+ }
+
+ private:
+ std::unique_ptr<char[]> ret_data; // output buffer
+ size_t ret_size; // capacity of ret_data
+ typename INFO::stream_t stream; // codec state
+};
+
+} // namespace zim
+
+#endif // _LIBZIM_COMPRESSION_
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_CONCURRENT_CACHE_H
+#define ZIM_CONCURRENT_CACHE_H
+
+#include "lrucache.h"
+
+#include <future>
+#include <mutex>
+
+namespace zim
+{
+
+/**
+   ConcurrentCache implements a concurrent thread-safe cache
+
+   Compared to zim::lru_cache, each access operation is slightly more expensive.
+   However, different slots of the cache can be safely accessed concurrently
+   with minimal blocking. Concurrent access to the same element is also
+   safe, and, in case of a cache miss, will block until that element becomes
+   available.
+ */
+template <typename Key, typename Value>
+class ConcurrentCache
+{
+private: // types
+  // The cache stores futures so that a slot can be published before its
+  // value has been computed.
+  typedef std::shared_future<Value> ValuePlaceholder;
+  typedef lru_cache<Key, ValuePlaceholder> Impl;
+
+public: // functions
+  explicit ConcurrentCache(size_t maxEntries)
+    : impl_(maxEntries)
+  {}
+
+  // Gets the entry corresponding to the given key. If the entry is not in the
+  // cache, it is obtained by calling f() (without any arguments) and the
+  // result is put into the cache.
+  //
+  // The cache as a whole is locked only for the duration of accessing
+  // the respective slot. If, in the case of a cache miss, the generation
+  // of the missing element takes a long time, only attempts to access that
+  // element will block - the rest of the cache remains open to concurrent
+  // access.
+  //
+  // If f() throws, the slot is removed from the cache and the exception is
+  // propagated to the caller.
+  template<class F>
+  Value getOrPut(const Key& key, F f)
+  {
+    std::promise<Value> valuePromise;
+    std::unique_lock<std::mutex> l(lock_);
+    const auto x = impl_.getOrPut(key, valuePromise.get_future().share());
+    l.unlock();
+    if ( x.miss() ) {
+      try {
+        valuePromise.set_value(f());
+      } catch (...) {
+        // Whatever the type of the exception, the promise will never be
+        // fulfilled: the slot must not stay in the cache, otherwise every
+        // later access to this key would get a broken future.
+        drop(key);
+        throw;
+      }
+    }
+
+    return x.value().get();
+  }
+
+  // Removes the entry of `key`; returns the result of lru_cache::drop().
+  bool drop(const Key& key)
+  {
+    std::unique_lock<std::mutex> l(lock_);
+    return impl_.drop(key);
+  }
+
+private: // data
+  Impl impl_;
+  std::mutex lock_; // protects impl_
+};
+
+} // namespace zim
+
+#endif // ZIM_CONCURRENT_CACHE_H
+
--- /dev/null
+
+#mesondefine VERSION
+
+#mesondefine DIRENT_CACHE_SIZE
+
+#mesondefine DIRENT_LOOKUP_CACHE_SIZE
+
+#mesondefine CLUSTER_CACHE_SIZE
+
+#mesondefine LZMA_MEMORY_SIZE
+
+#mesondefine ENABLE_XAPIAN
+
+#mesondefine ENABLE_USE_MMAP
+
+#mesondefine ENABLE_USE_BUFFER_HEADER
+
+#mesondefine MMAP_SUPPORT_64
+
+#mesondefine ENV64BIT
+
+#mesondefine ENV32BIT
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+// Term literal "0posanchor " (note the trailing space).
+#define ANCHOR_TERM "0posanchor "
+
+// Default cluster size in bytes (2 MiB). Parenthesized so that the macro
+// expands to a single value inside any expression (e.g. `x / DEFAULT_CLUSTER_SIZE`).
+#define DEFAULT_CLUSTER_SIZE (2*1024*1024)
--- /dev/null
+/*
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef DEBUG_H_
+#define DEBUG_H_
+
+#include <iostream>
+#include <sstream>
+#include <stdexcept>
+#include <stdlib.h>
+
+#if defined (NDEBUG)
+// Release builds (NDEBUG): ASSERT is a no-op and its operands are not evaluated.
+# define ASSERT(left, operator, right) (void(0))
+#else
+
+#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__)
+#include <execinfo.h>
+#endif
+
+// Reports a failed ASSERT(vara op varb).
+//
+// Writes the failed expression, the values of both operands (`a`, `b`) and
+// the source location to std::cerr, followed by a backtrace where glibc's
+// backtrace facility is available, then throws std::runtime_error carrying
+// the same message. This function never returns normally.
+template<typename T, typename U>
+void _on_assert_fail(const char* vara, const char* op, const char* varb,
+                     T a, U b, const char* file, int line)  {
+  std::ostringstream ss;
+  ss << "\nAssertion failed at "<< file << ":" << line << "\n " <<
+      vara << "[" << a << "] " << op << " " << varb << "[" << b << "]";
+  std::cerr << ss.str() << std::endl;
+
+#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__) && !defined(__EMSCRIPTEN__) && defined(__GNU_LIBRARY__)
+  void *callstack[64];
+  const int frames = backtrace(callstack, 64);
+  char** strings = backtrace_symbols(callstack, frames);
+  // backtrace_symbols() allocates its result and returns NULL on failure:
+  // skip the backtrace in that case instead of dereferencing a null pointer.
+  if (strings != nullptr) {
+    for (int i=0; i<frames; i++) {
+      std::cerr << strings[i] << std::endl;
+    }
+    free(strings);
+  }
+#endif
+  throw std::runtime_error(ss.str());
+}
+
+// Debug builds: evaluates each operand exactly once, and if
+// `left operator right` does not hold, reports it through _on_assert_fail()
+// (which prints the operand values and throws std::runtime_error).
+# define ASSERT(left, operator, right) do { auto _left = left; auto _right = right; if (!((_left) operator (_right))) _on_assert_fail(#left, #operator, #right, _left, _right, __FILE__, __LINE__); } while(0)
+
+#endif
+
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DECODERSTREAMREADER_H
+#define ZIM_DECODERSTREAMREADER_H
+
+#include "compression.h"
+#include "istreamreader.h"
+
+namespace zim
+{
+
+// Stream reader giving sequential access to the decoded content of a
+// compressed stream.
+//
+// The encoded data is pulled from `inputReader` by chunks of CHUNK_SIZE
+// bytes and decoded on demand, directly into the buffer given to readImpl().
+// Decoder is the codec adaptor (LZMA_INFO or ZSTD_INFO).
+template<typename Decoder>
+class DecoderStreamReader : public IStreamReader
+{
+private: // constants
+  enum { CHUNK_SIZE = 1024 };
+
+public: // functions
+  DecoderStreamReader(std::shared_ptr<const Reader> inputReader)
+    : m_encodedDataReader(inputReader),
+      m_currentInputOffset(0),
+      m_inputBytesLeft(inputReader->size()),
+      m_encodedDataChunk(Buffer::makeBuffer(zsize_t(CHUNK_SIZE)))
+  {
+    Decoder::init_stream_decoder(&m_decoderState, nullptr);
+    readNextChunk();
+  }
+
+  ~DecoderStreamReader()
+  {
+    Decoder::stream_end_decode(&m_decoderState);
+  }
+
+private: // functions
+  // Loads the next chunk of encoded data (at most CHUNK_SIZE bytes) and
+  // makes it the input of the decoder.
+  void readNextChunk()
+  {
+    const auto n = std::min(zsize_t(CHUNK_SIZE), m_inputBytesLeft);
+    m_encodedDataChunk = m_encodedDataReader->get_buffer(m_currentInputOffset, n);
+    m_currentInputOffset += n;
+    m_inputBytesLeft -= n;
+    // XXX: ugly C-style cast (casting away constness) on the next line
+    m_decoderState.next_in = (unsigned char*)m_encodedDataChunk.data();
+    m_decoderState.avail_in = m_encodedDataChunk.size().v;
+  }
+
+  // Runs one decoding step, refilling the decoder input first if it is
+  // empty. Once the whole input has been consumed the decoder is asked to
+  // finish.
+  CompStatus decodeMoreBytes()
+  {
+    CompStep step = CompStep::STEP;
+    if ( m_decoderState.avail_in == 0 )
+    {
+      if ( m_inputBytesLeft.v == 0 )
+        step = CompStep::FINISH;
+      else
+        readNextChunk();
+    }
+
+    return Decoder::stream_run_decode(&m_decoderState, step);
+  }
+
+  // Decodes exactly `nbytes` bytes into `buf`.
+  // Throws ZimFileFormatError if the encoded data ends before `nbytes`
+  // bytes could be decoded; decoder errors are reported by the exceptions
+  // thrown by Decoder::stream_run_decode().
+  void readImpl(char* buf, zsize_t nbytes) override
+  {
+    m_decoderState.next_out = (unsigned char*)buf;
+    m_decoderState.avail_out = nbytes.v;
+    unsigned stalledCalls = 0;
+    while ( m_decoderState.avail_out != 0 )
+    {
+      const bool hadNoInput = m_decoderState.avail_in == 0
+                           && m_inputBytesLeft.v == 0;
+      const auto outLeftBefore = m_decoderState.avail_out;
+
+      // We don't care about the return code of decodeMoreBytes.
+      // We feed (or stop feeding) the decoder based on what
+      // we need to decode and the `avail_in`.
+      // If there is an error somehow, an exception will be thrown.
+      decodeMoreBytes();
+
+      // A call made without any input left that produces no output cannot
+      // be followed by a more successful one: the stream is shorter than
+      // what is requested. Bail out instead of spinning forever (one extra
+      // call is tolerated, as lzma reports a stall on the second call only).
+      if ( hadNoInput && m_decoderState.avail_out == outLeftBefore )
+      {
+        if ( ++stalledCalls >= 2 )
+          throw ZimFileFormatError("Unexpected end of compressed data");
+      }
+      else
+      {
+        stalledCalls = 0;
+      }
+    }
+  }
+
+private: // types
+  typedef typename Decoder::stream_t DecoderState;
+
+private: // data
+  std::shared_ptr<const Reader> m_encodedDataReader;
+  offset_t m_currentInputOffset; // offset of the next chunk in the input
+  zsize_t m_inputBytesLeft; // count of bytes left in the input stream
+  DecoderState m_decoderState;
+  Buffer m_encodedDataChunk; // chunk currently fed to the decoder
+};
+
+} // namespace zim
+
+#endif // ZIM_DECODERSTREAMREADER_H
--- /dev/null
+/*
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "_dirent.h"
+#include "direntreader.h"
+#include <zim/zim.h>
+#include <zim/error.h>
+#include "buffer.h"
+#include "bufferstreamer.h"
+#include "endian_tools.h"
+#include "log.h"
+#include <algorithm>
+#include <cstring>
+
+log_define("zim.dirent")
+
+namespace zim
+{
+ //////////////////////////////////////////////////////////////////////
+ // Dirent
+ //
+
+ const uint16_t Dirent::redirectMimeType;
+ const uint16_t Dirent::linktargetMimeType;
+ const uint16_t Dirent::deletedMimeType;
+
+ // Parses the serialized dirent found at the start of `direntData` into
+ // `dirent`.
+ //
+ // Layout read here: mime type (16 bits), parameter length (8 bits),
+ // namespace (char), version (32 bits), then either a redirect index,
+ // nothing (linktarget / deleted entries) or a cluster number and a blob
+ // number, followed by the NUL-terminated url, the NUL-terminated title and
+ // `extraLen` bytes of parameter.
+ //
+ // Returns false when `direntData` is too short to hold the url, the title
+ // and the parameter; `dirent` may then be partially filled.
+ bool DirentReader::initDirent(Dirent& dirent, const Buffer& direntData) const
+ {
+   BufferStreamer reader(direntData);
+   uint16_t mimeType = reader.read<uint16_t>();
+   bool redirect = (mimeType == Dirent::redirectMimeType);
+   bool linktarget = (mimeType == Dirent::linktargetMimeType);
+   bool deleted = (mimeType == Dirent::deletedMimeType);
+   uint8_t extraLen = reader.read<uint8_t>();
+   char ns = reader.read<char>();
+   uint32_t version = reader.read<uint32_t>();
+   dirent.setVersion(version);
+
+   if (redirect)
+   {
+     entry_index_type redirectIndex(reader.read<entry_index_type>());
+
+     log_debug("redirectIndex=" << redirectIndex);
+
+     dirent.setRedirect(entry_index_t(redirectIndex));
+   }
+   else if (linktarget || deleted)
+   {
+     log_debug("linktarget or deleted entry");
+     dirent.setItem(mimeType, cluster_index_t(0), blob_index_t(0));
+   }
+   else
+   {
+     log_debug("read article entry");
+
+     uint32_t clusterNumber = reader.read<uint32_t>();
+     uint32_t blobNumber = reader.read<uint32_t>();
+
+     log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber);
+
+     dirent.setItem(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber));
+   }
+
+   std::string url;
+   std::string title;
+   std::string parameter;
+
+   log_debug("read url, title and parameters");
+
+   // Extracts the NUL-terminated string located at the reader position and
+   // moves the reader past its terminator. The last `extraLen` bytes are
+   // reserved for the parameter and are not searched.
+   auto readCString = [&reader, extraLen](std::string& out) -> bool {
+     const size_type available = reader.left().v;
+     // Without this check `available - extraLen` wraps around and strnlen()
+     // would scan far beyond the end of the buffer.
+     if (extraLen > available) {
+       return false;
+     }
+     const size_type length = strnlen(reader.current(), available - extraLen);
+     if (length >= available) {
+       return false;
+     }
+     out.assign(reader.current(), length);
+     reader.skip(zsize_t(length+1));
+     return true;
+   };
+
+   if (!readCString(url) || !readCString(title)) {
+     return false;
+   }
+
+   if (extraLen > reader.left().v) {
+     return false;
+   }
+   parameter = std::string(reader.current(), extraLen);
+   dirent.setUrl(ns, url);
+   dirent.setTitle(title);
+   dirent.setParameter(parameter);
+   return true;
+ }
+
+ /**
+  * Read and parse the dirent stored at `offset` in the zim reader.
+  *
+  * @param offset  Position of the first byte of the dirent.
+  * @return the parsed dirent.
+  * @throws ZimFileFormatError if `offset` is not inside the reader, or if no
+  *         complete dirent can be parsed from the data between `offset` and
+  *         the end of the reader.
+  */
+ std::shared_ptr<const Dirent> DirentReader::readDirent(offset_t offset)
+ {
+   const auto totalSize = mp_zimReader->size();
+   if (offset.v >= totalSize.v) {
+     throw ZimFileFormatError("Invalid dirent pointer");
+   }
+
+   // We don't know the size of the dirent because it depends on the size of
+   // the title, url and extra parameters.
+   // We cannot take a buffer of the size of the file, it would be really
+   // inefficient. So we try with a small buffer and retry with a bigger one
+   // until parsing succeeds. Most dirents will be "Article" entries (header
+   // size == 16) without extra parameters, so url + title will usually fit
+   // in 256 bytes.
+
+   // Never ask the reader for more than what is left after `offset`.
+   const size_type maxSize = totalSize.v - offset.v;
+   size_type bufferSize = std::min(size_type(256), maxSize);
+   auto dirent = std::make_shared<Dirent>();
+   std::lock_guard<std::mutex> lock(m_bufferMutex);
+   while (true) {
+     // resize() (not reserve()) so that the bytes written through data()
+     // are actual elements of the vector.
+     m_buffer.resize(bufferSize);
+     mp_zimReader->read(m_buffer.data(), offset, zsize_t(bufferSize));
+     if ( initDirent(*dirent, Buffer::makeBuffer(m_buffer.data(), zsize_t(bufferSize))) )
+       return dirent;
+
+     if (bufferSize >= maxSize) {
+       // Everything up to the end of the reader has been tried: the dirent
+       // is truncated or corrupted.
+       throw ZimFileFormatError("Invalid dirent: no complete dirent found before end of file");
+     }
+     bufferSize = std::min(bufferSize + size_type(256), maxSize);
+   }
+ }
+
+ // Returns the url prefixed with its namespace, i.e. "<namespace>/<url>".
+ std::string Dirent::getLongUrl() const
+ {
+   log_trace("Dirent::getLongUrl()");
+   log_debug("namespace=" << getNamespace() << " title=" << getTitle());
+
+   std::string longUrl(1, getNamespace());
+   longUrl += '/';
+   longUrl += getUrl();
+   return longUrl;
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "dirent_accessor.h"
+
+#include "direntreader.h"
+#include "_dirent.h"
+#include "envvalue.h"
+
+#include <mutex>
+
+#include <zim/error.h>
+
+using namespace zim;
+
+// Builds an accessor over `direntCount` dirents.
+//  - direntReader: shared reader used to parse a dirent at a given offset.
+//  - urlPtrReader: reader over the table of dirent offsets (ownership is
+//    transferred to the accessor).
+// The dirent cache capacity is taken from the ZIM_DIRENTCACHE environment
+// variable, falling back to DIRENT_CACHE_SIZE.
+DirectDirentAccessor::DirectDirentAccessor(std::shared_ptr<DirentReader> direntReader, std::unique_ptr<const Reader> urlPtrReader, entry_index_t direntCount)
+ : mp_direntReader(direntReader),
+ mp_urlPtrReader(std::move(urlPtrReader)),
+ m_direntCount(direntCount),
+ m_direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
+ m_bufferDirentZone(256)  // vector of 256 chars
+{}
+
+// Returns the dirent at index `idx`, serving it from the dirent cache when
+// possible. The cache lock is only held around the cache accesses, not while
+// the dirent is being read.
+std::shared_ptr<const Dirent> DirectDirentAccessor::getDirent(entry_index_t idx) const
+{
+  // Fast path: the dirent is already cached.
+  {
+    std::lock_guard<std::mutex> cacheGuard(m_direntCacheLock);
+    auto cached = m_direntCache.get(idx.v);
+    if (cached.hit())
+      return cached.value();
+  }
+
+  // Cache miss: locate and parse the dirent, then remember it.
+  auto dirent = readDirent(getOffset(idx));
+
+  std::lock_guard<std::mutex> cacheGuard(m_direntCacheLock);
+  m_direntCache.put(idx.v, dirent);
+  return dirent;
+}
+
+// Looks up the offset of dirent `idx` in the offset table.
+// Throws std::out_of_range when `idx` is not below the dirent count.
+offset_t DirectDirentAccessor::getOffset(entry_index_t idx) const
+{
+  if (idx >= m_direntCount) {
+    throw std::out_of_range("entry index out of range");
+  }
+  // Entry `idx` of the table starts idx * sizeof(offset_type) bytes in.
+  const offset_t tablePos(sizeof(offset_type)*idx.v);
+  return offset_t(mp_urlPtrReader->read_uint<offset_type>(tablePos));
+}
+
+// Parses the dirent located at `offset` by delegating to the dirent reader.
+std::shared_ptr<const Dirent> DirectDirentAccessor::readDirent(offset_t offset) const
+{
+  auto& direntReader = *mp_direntReader;
+  return direntReader.readDirent(offset);
+}
+
+
+// Builds an accessor that maps a title index to an entry index through
+// `indexReader` (ownership transferred) and then fetches the dirent from
+// `direntAccessor`. `direntCount` is the number of entries in the index.
+IndirectDirentAccessor::IndirectDirentAccessor(std::shared_ptr<const DirectDirentAccessor> direntAccessor, std::unique_ptr<const Reader> indexReader, title_index_t direntCount)
+ : mp_direntAccessor(direntAccessor),
+ mp_indexReader(std::move(indexReader)),
+ m_direntCount(direntCount)
+{}
+
+// Translates the title index `idx` into the corresponding entry index by
+// reading entry `idx` of the index table.
+// Throws std::out_of_range when `idx` is not below the index size.
+entry_index_t IndirectDirentAccessor::getDirectIndex(title_index_t idx) const
+{
+  if (idx >= m_direntCount) {
+    throw std::out_of_range("entry index out of range");
+  }
+  // The table stores raw entry_index_type values, so the stride must be the
+  // size of that on-disk type (not of the entry_index_t wrapper), exactly
+  // like the value type passed to read_uint below.
+  const offset_t tablePos(sizeof(entry_index_type)*idx.v);
+  entry_index_t index(mp_indexReader->read_uint<entry_index_type>(tablePos));
+  return index;
+}
+
+// Returns the dirent designated by title index `idx`: the index is first
+// translated to an entry index, then resolved by the direct accessor.
+std::shared_ptr<const Dirent> IndirectDirentAccessor::getDirent(title_index_t idx) const
+{
+  return mp_direntAccessor->getDirent(getDirectIndex(idx));
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DIRENT_ACCESSOR_H
+#define ZIM_DIRENT_ACCESSOR_H
+
+#include "zim_types.h"
+#include "debug.h"
+#include "lrucache.h"
+
+#include <memory>
+#include <mutex>
+#include <vector>
+
+namespace zim
+{
+
+class Dirent;
+class Reader;
+class DirentReader;
+
+/**
+ * DirectDirentAccessor is used to access a dirent from its index.
+ * It doesn't provide any "advanced" features like lookup or find.
+ *
+ * This is the base class to locate a dirent (offset) and read it.
+ *
+ * Dirents that have been read are kept in an LRU cache protected by a mutex.
+ */
+
+class DirectDirentAccessor
+{
+public: // functions
+ // `urlPtrReader` gives access to the table of dirent offsets; `direntCount`
+ // is the number of entries in that table.
+ DirectDirentAccessor(std::shared_ptr<DirentReader> direntReader, std::unique_ptr<const Reader> urlPtrReader, entry_index_t direntCount);
+
+ // Offset of dirent `idx`; throws std::out_of_range for an invalid index.
+ offset_t getOffset(entry_index_t idx) const;
+ // Dirent at index `idx` (served from the cache when possible).
+ std::shared_ptr<const Dirent> getDirent(entry_index_t idx) const;
+ entry_index_t getDirentCount() const { return m_direntCount; }
+
+private: // functions
+ // Reads the dirent at the given offset through the dirent reader.
+ std::shared_ptr<const Dirent> readDirent(offset_t) const;
+
+private: // data
+ std::shared_ptr<DirentReader> mp_direntReader;
+ std::unique_ptr<const Reader> mp_urlPtrReader;
+ entry_index_t m_direntCount;
+
+ // Cache of already parsed dirents, keyed by entry index.
+ mutable lru_cache<entry_index_type, std::shared_ptr<const Dirent>> m_direntCache;
+ mutable std::mutex m_direntCacheLock;
+
+ // NOTE(review): these two members look unused apart from construction;
+ // confirm before removing.
+ mutable std::vector<char> m_bufferDirentZone;
+ mutable std::mutex m_bufferDirentLock;
+};
+
+// Accesses dirents through an indirection table: a title index is first
+// translated into an entry index (read from `indexReader`), which is then
+// resolved by a DirectDirentAccessor.
+class IndirectDirentAccessor
+{
+ public:
+ IndirectDirentAccessor(std::shared_ptr<const DirectDirentAccessor>, std::unique_ptr<const Reader> indexReader, title_index_t direntCount);
+
+ // Entry index stored at position `idx` of the index table; throws
+ // std::out_of_range for an invalid index.
+ entry_index_t getDirectIndex(title_index_t idx) const;
+ // Dirent designated by title index `idx`.
+ std::shared_ptr<const Dirent> getDirent(title_index_t idx) const;
+ title_index_t getDirentCount() const { return m_direntCount; }
+
+ private: // data
+ std::shared_ptr<const DirectDirentAccessor> mp_direntAccessor;
+ std::unique_ptr<const Reader> mp_indexReader;
+ title_index_t m_direntCount;
+};
+
+} // namespace zim
+
+#endif // ZIM_DIRENT_ACCESSOR_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DIRENT_LOOKUP_H
+#define ZIM_DIRENT_LOOKUP_H
+
+#include "zim_types.h"
+#include "debug.h"
+#include "narrowdown.h"
+
+#include <algorithm>
+#include <map>
+#include <mutex>
+#include <vector>
+
+namespace zim
+{
+
+// Looks dirents up by (namespace, url) on top of an `Impl` that provides
+// getDirentCount() and getDirent(entry_index_t).
+// A grid of sampled keys (lookupGrid) narrows the range of a search before a
+// binary search is run on the dirents themselves, and namespace boundaries
+// are cached once computed.
+template<class Impl>
+class DirentLookup
+{
+public: // types
+ // (found, index) pair returned by find().
+ typedef std::pair<bool, entry_index_t> Result;
+
+public: // functions
+ // `cacheEntryCount` controls how many dirents are sampled into the grid.
+ DirentLookup(const Impl* _impl, entry_index_type cacheEntryCount);
+
+ entry_index_t getNamespaceRangeBegin(char ns) const;
+ entry_index_t getNamespaceRangeEnd(char ns) const;
+
+ Result find(char ns, const std::string& url);
+
+private: // functions
+ // Key of dirent `i`: its namespace character followed by its url.
+ std::string getDirentKey(entry_index_type i) const;
+
+private: // types
+ typedef std::map<char, entry_index_t> NamespaceBoundaryCache;
+
+private: // data
+ const Impl* impl = nullptr;
+
+ // Cache filled by getNamespaceRangeBegin(); guarded by cacheAccessMutex.
+ mutable NamespaceBoundaryCache namespaceBoundaryCache;
+ mutable std::mutex cacheAccessMutex;
+
+ entry_index_type direntCount = 0;
+ NarrowDown lookupGrid;
+};
+
+// Builds the lookup key of dirent `i`: namespace character followed by url.
+template<class Impl>
+std::string
+DirentLookup<Impl>::getDirentKey(entry_index_type i) const
+{
+  const auto dirent = impl->getDirent(entry_index_t(i));
+  std::string key(1, dirent->getNamespace());
+  key += dirent->getUrl();
+  return key;
+}
+
+// Builds the lookup grid by sampling roughly `cacheEntryCount` dirent keys
+// at a regular step, always including the last dirent.
+// A `cacheEntryCount` of 0 is treated as 1 (coarsest possible grid) instead
+// of dividing by zero.
+template<class Impl>
+DirentLookup<Impl>::DirentLookup(const Impl* _impl, entry_index_type cacheEntryCount)
+{
+  ASSERT(impl == nullptr, ==, true);
+  impl = _impl;
+  direntCount = entry_index_type(impl->getDirentCount());
+  if ( direntCount )
+  {
+    // Guard the division below: the count typically comes from configuration
+    // and may legitimately be 0.
+    const entry_index_type gridSize = std::max(1u, cacheEntryCount);
+    const entry_index_type step = std::max(1u, direntCount/gridSize);
+    for ( entry_index_type i = 0; i < direntCount-1; i += step )
+    {
+      // Each sample is registered together with the key following it.
+      lookupGrid.add(getDirentKey(i), i, getDirentKey(i+1));
+    }
+    lookupGrid.close(getDirentKey(direntCount - 1), direntCount - 1);
+  }
+}
+
+// Binary search for the index of the first dirent whose namespace is not
+// lower than `ch`. Dirents are expected to be sorted by namespace.
+template<typename IMPL>
+entry_index_t getNamespaceBeginOffset(IMPL& impl, char ch)
+{
+  ASSERT(ch, >=, 32);
+  ASSERT(ch, <=, 127);
+
+  entry_index_type lo = 0;
+  entry_index_type hi = entry_index_type(impl.getDirentCount());
+  const auto firstDirent = impl.getDirent(entry_index_t(0));
+  while (hi - lo > 1)
+  {
+    const entry_index_type mid = lo + (hi - lo) / 2;
+    const auto midDirent = impl.getDirent(entry_index_t(mid));
+    if (midDirent->getNamespace() < ch)
+      lo = mid;
+    else
+      hi = mid;
+  }
+
+  // `lo` is only the answer when the very first dirent already belongs to a
+  // namespace >= ch; otherwise the range starts at `hi`.
+  if (firstDirent->getNamespace() < ch)
+    return entry_index_t(hi);
+  return entry_index_t(lo);
+}
+
+// End (one past the last dirent) of namespace `ch`, computed as the
+// beginning of the next namespace character.
+template<typename IMPL>
+entry_index_t getNamespaceEndOffset(IMPL& impl, char ch)
+{
+  ASSERT(ch, >=, 32);
+  ASSERT(ch, <, 127);
+  const char nextNamespace = ch+1;
+  return getNamespaceBeginOffset(impl, nextNamespace);
+}
+
+
+
+// Index of the first dirent of namespace `ch`. Results are memoized in
+// namespaceBoundaryCache; the mutex is released while the boundary is being
+// computed.
+template<class Impl>
+entry_index_t
+DirentLookup<Impl>::getNamespaceRangeBegin(char ch) const
+{
+  ASSERT(ch, >=, 32);
+  ASSERT(ch, <=, 127);
+
+  {
+    std::lock_guard<std::mutex> guard(cacheAccessMutex);
+    const auto cached = namespaceBoundaryCache.find(ch);
+    if (cached != namespaceBoundaryCache.end())
+      return cached->second;
+  }
+
+  auto boundary = getNamespaceBeginOffset(*impl, ch);
+
+  std::lock_guard<std::mutex> guard(cacheAccessMutex);
+  namespaceBoundaryCache[ch] = boundary;
+  return boundary;
+}
+
+// End of namespace `ns`, i.e. the beginning of the next namespace character.
+template<class Impl>
+entry_index_t
+DirentLookup<Impl>::getNamespaceRangeEnd(char ns) const
+{
+  const char nextNamespace = ns+1;
+  return getNamespaceRangeBegin(nextNamespace);
+}
+
+// Searches for the dirent with namespace `ns` and url `url`.
+// Returns {true, index} when found. Otherwise returns {false, index} where
+// index is the upper bound reached by the search (or the start of the empty
+// range given by the lookup grid).
+template<typename Impl>
+typename DirentLookup<Impl>::Result
+DirentLookup<Impl>::find(char ns, const std::string& url)
+{
+ // Narrow the search to [l, u) using the sampled keys.
+ const auto r = lookupGrid.getRange(ns + url);
+ entry_index_type l(r.begin);
+ entry_index_type u(r.end);
+
+ if (l == u)
+ return {false, entry_index_t(l)};
+
+ // Binary search inside [l, u).
+ // NOTE(review): the `c < 0` branch has no explicit exit when u reaches l;
+ // termination presumably relies on getRange() returning a range whose first
+ // dirent is not greater than the key - confirm against NarrowDown.
+ while (true)
+ {
+ entry_index_type p = l + (u - l) / 2;
+ const auto d = impl->getDirent(entry_index_t(p));
+
+ // Compare namespaces first, then urls.
+ const int c = ns < d->getNamespace() ? -1
+ : ns > d->getNamespace() ? 1
+ : url.compare(d->getUrl());
+
+ if (c < 0)
+ u = p;
+ else if (c > 0)
+ {
+ // p == l means the range cannot shrink any more: not found.
+ if ( l == p )
+ return {false, entry_index_t(u)};
+ l = p;
+ }
+ else
+ return {true, entry_index_t(p)};
+ }
+}
+
+} // namespace zim
+
+#endif // ZIM_DIRENT_LOOKUP_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DIRENTREADER_H
+#define ZIM_DIRENTREADER_H
+
+#include "_dirent.h"
+#include "reader.h"
+
+#include <memory>
+#include <mutex>
+#include <vector>
+
+namespace zim
+{
+
+// Unlike FileReader and MemoryReader (which read data from a file and memory,
+// respectively), DirentReader is a helper class that reads Dirents (rather
+// than from a Dirent).
+class DirentReader
+{
+public: // functions
+ explicit DirentReader(std::shared_ptr<const Reader> zimReader)
+ : mp_zimReader(zimReader)
+ {}
+
+ // Reads and parses the dirent located at `offset` in the zim reader.
+ // Throws ZimFileFormatError if `offset` is outside the reader.
+ std::shared_ptr<const Dirent> readDirent(offset_t offset);
+
+private: // functions
+ // Fills `dirent` from `direntData`; returns false when the buffer is too
+ // small to contain the whole dirent.
+ bool initDirent(Dirent& dirent, const Buffer& direntData) const;
+
+private: // data
+ std::shared_ptr<const Reader> mp_zimReader;
+ // Scratch buffer used by readDirent(); accesses are serialized by
+ // m_bufferMutex.
+ std::vector<char> m_buffer;
+ std::mutex m_bufferMutex;
+};
+
+} // namespace zim
+
+#endif // ZIM_DIRENTREADER_H
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ENDIAN_H
+#define ENDIAN_H
+
+#include <algorithm>
+#include <iostream>
+#include <zim/zim.h>
+
+namespace zim
+{
+
+// Serializes a value of N bytes into `dst` in little-endian order (least
+// significant byte first). Only the 2, 4 and 8 byte widths are implemented.
+template<typename T, size_t N>
+struct ToLittleEndianImpl;
+
+template<typename T>
+struct ToLittleEndianImpl<T, 2>{
+  static void write(const T& d, char* dst) {
+    const uint16_t bits = static_cast<uint16_t>(d);
+    for (size_t byte = 0; byte < 2; ++byte) {
+      dst[byte] = static_cast<uint8_t>(bits >> (8 * byte));
+    }
+  }
+};
+
+template<typename T>
+struct ToLittleEndianImpl<T, 4>{
+  static void write(const T& d, char* dst) {
+    const uint32_t bits = static_cast<uint32_t>(d);
+    for (size_t byte = 0; byte < 4; ++byte) {
+      dst[byte] = static_cast<uint8_t>(bits >> (8 * byte));
+    }
+  }
+};
+
+template<typename T>
+struct ToLittleEndianImpl<T, 8>{
+  static void write(const T& d, char* dst) {
+    const uint64_t bits = static_cast<uint64_t>(d);
+    for (size_t byte = 0; byte < 8; ++byte) {
+      dst[byte] = static_cast<uint8_t>(bits >> (8 * byte));
+    }
+  }
+};
+
+////////////////////////////////////////////////////////////////////////
+// Writes `d` into `dst` as sizeof(T) little-endian bytes.
+template <typename T>
+inline void toLittleEndian(T d, char* dst)
+{
+  using Writer = ToLittleEndianImpl<T, sizeof(T)>;
+  Writer::write(d, dst);
+}
+
+// Decodes sizeof(T) little-endian bytes starting at `ptr` into a T.
+template <typename T>
+inline T fromLittleEndian(const char* ptr)
+{
+  T value = 0;
+  size_t shift = 0;
+  for (const char* p = ptr; p != ptr + sizeof(T); ++p, shift += 8) {
+    // Go through uint8_t so that a negative char does not sign-extend.
+    const uint8_t byte = static_cast<uint8_t>(*p);
+    value |= (static_cast<T>(byte) << shift);
+  }
+  return value;
+}
+
+}
+
+#endif // ENDIAN_H
+
--- /dev/null
+/*
+ * Copyright (C) 2021 Renaud Gaudin <rgaudin@gmail.com>
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/entry.h>
+#include <zim/error.h>
+#include <zim/item.h>
+#include "_dirent.h"
+#include "fileimpl.h"
+#include "file_part.h"
+#include "log.h"
+
+#include <sstream>
+
+log_define("zim.entry")
+
+using namespace zim;
+
+// Builds the entry at index `idx` of `file`. The corresponding dirent is
+// fetched from the file right away and kept for the lifetime of the entry.
+Entry::Entry(std::shared_ptr<FileImpl> file, entry_index_type idx)
+ : m_file(file),
+ m_idx(idx),
+ m_dirent(file->getDirent(entry_index_t(idx)))
+{}
+
+// Title of the entry, as stored in its dirent.
+std::string Entry::getTitle() const
+{
+  const auto& dirent = *m_dirent;
+  return dirent.getTitle();
+}
+
+// Path of the entry: the bare url for files using the new namespace scheme,
+// the "<namespace>/<url>" form otherwise.
+std::string Entry::getPath() const
+{
+  if (!m_file->hasNewNamespaceScheme()) {
+    return m_dirent->getLongUrl();
+  }
+  return m_dirent->getUrl();
+}
+
+// Tells whether the entry's dirent is a redirect.
+bool Entry::isRedirect() const
+{
+  const auto& dirent = *m_dirent;
+  return dirent.isRedirect();
+}
+
+// Returns the item of this entry. For a redirect entry, the redirect chain
+// is followed when `follow` is true; otherwise InvalidType is thrown.
+Item Entry::getItem(bool follow) const
+{
+  if (!isRedirect()) {
+    return Item(m_file, m_idx);
+  }
+
+  if (follow) {
+    return getRedirect();
+  }
+
+  std::ostringstream sstream;
+  sstream << "Entry " << getPath() << " is a redirect entry.";
+  throw InvalidType(sstream.str());
+}
+
+// Follows the redirect chain starting at this entry and returns the item
+// finally reached. The number of hops is bounded by a watchdog so that a
+// redirect loop cannot spin forever; if the chain is still a redirect when
+// the watchdog expires, getItem(false) throws.
+Item Entry::getRedirect() const {
+  auto target = getRedirectEntry();
+  for (auto hopsLeft = 50U; target.isRedirect() && --hopsLeft; ) {
+    target = target.getRedirectEntry();
+  }
+  return target.getItem(false);
+}
+
+// Returns the entry this redirect points to (a single hop).
+// Throws InvalidType when the entry is not a redirect.
+Entry Entry::getRedirectEntry() const {
+  if (isRedirect()) {
+    const auto targetIdx = static_cast<entry_index_type>(m_dirent->getRedirectIndex());
+    return Entry(m_file, targetIdx);
+  }
+
+  std::ostringstream sstream;
+  sstream << "Entry " << getPath() << " is not a redirect entry.";
+  throw InvalidType(sstream.str());
+}
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <sstream>
+#include <stdlib.h>
+
+namespace zim
+{
+ /**
+  * Read an unsigned integer from the environment variable `env`.
+  *
+  * @param env  Name of the environment variable.
+  * @param def  Value returned when the variable is not set or does not
+  *             start with a valid unsigned number.
+  * @return the parsed value, or `def`.
+  */
+ unsigned envValue(const char* env, unsigned def)
+ {
+   const char* v = ::getenv(env);
+   if (!v)
+     return def;
+
+   // Parse into a temporary: a failed extraction overwrites its target
+   // (with 0 since C++11), which must not clobber the default.
+   std::istringstream s(v);
+   unsigned parsed = 0;
+   return (s >> parsed) ? parsed : def;
+ }
+
+ /**
+  * Read a memory size from the environment variable `env`.
+  *
+  * The value is an unsigned number optionally followed by a unit:
+  * k/K (x1024), m/M (x1024^2) or g/G (x1024^3). Any other unit is ignored.
+  *
+  * @param env  Name of the environment variable.
+  * @param def  Value returned when the variable is not set or does not
+  *             start with a valid unsigned number.
+  * @return the size in bytes, saturated to the largest `unsigned` value
+  *         when number * unit does not fit, or `def`.
+  */
+ unsigned envMemSize(const char* env, unsigned def)
+ {
+   const char* v = ::getenv(env);
+   if (!v)
+     return def;
+
+   // Parse into a temporary: a failed extraction overwrites its target,
+   // which must not clobber the default.
+   std::istringstream s(v);
+   unsigned value = 0;
+   if (!(s >> value))
+     return def;
+
+   // The unit is optional; `unit` stays '\0' when there is none.
+   char unit = '\0';
+   s >> unit;
+
+   unsigned multiplier = 1;
+   switch (unit)
+   {
+     case 'k':
+     case 'K': multiplier = 1024; break;
+     case 'm':
+     case 'M': multiplier = 1024 * 1024; break;
+     case 'g':
+     case 'G': multiplier = 1024 * 1024 * 1024; break;
+   }
+
+   // Saturate instead of silently wrapping around (e.g. "4G" would
+   // otherwise become 0 with a 32-bit unsigned).
+   const unsigned maxValue = static_cast<unsigned>(-1);
+   if (value > maxValue / multiplier)
+     return maxValue;
+   return value * multiplier;
+ }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ENVVALUE_H
+#define ZIM_ENVVALUE_H
+
+namespace zim
+{
+ // Returns the environment variable `env` parsed as an unsigned integer,
+ // or `def` when the variable is not set.
+ unsigned envValue(const char* env, unsigned def);
+ // Same as envValue(), but the number may be followed by a unit suffix:
+ // k/K, m/M or g/G multiply it by 1024, 1024^2 or 1024^3 respectively.
+ unsigned envMemSize(const char* env, unsigned def);
+}
+
+#endif // ZIM_ENVVALUE_H
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "file_compound.h"
+#include "buffer.h"
+
+#include <errno.h>
+#include <string.h>
+#include <sstream>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace zim {
+
+// Appends `fpart` to the compound: it covers the global offsets
+// [current size, current size + part size), and the total size grows by the
+// size of the part.
+void FileCompound::addPart(FilePart* fpart)
+{
+  const offset_t partBegin(_fsize.v);
+  const offset_t partEnd((_fsize+fpart->size()).v);
+  emplace(Range(partBegin, partEnd), fpart);
+  _fsize += fpart->size();
+}
+
+// Opens `filename` as a single-part file. If that fails, tries to open it as
+// a split file made of the parts "<filename>aa", "<filename>ab", ... up to
+// "<filename>zz", stopping at the first part that cannot be opened.
+// Throws std::runtime_error when nothing could be opened at all.
+FileCompound::FileCompound(const std::string& filename):
+  _filename(filename),
+  _fsize(0)
+{
+  try {
+    addPart(new FilePart(filename));
+  } catch(...) {
+    // Keep the errno of the first failure for the error message below.
+    int errnoSave = errno;
+    _fsize = zsize_t(0);
+    try {
+      for (char ch0 = 'a'; ch0 <= 'z'; ++ch0)
+      {
+        const std::string fname0 = filename + ch0;
+        for (char ch1 = 'a'; ch1 <= 'z'; ++ch1)
+        {
+          addPart(new FilePart(fname0 + ch1));
+        }
+      }
+    } catch (...) {
+      // Expected: the first missing part ends the enumeration.
+    }
+
+    if (empty())
+    {
+      std::ostringstream msg;
+      // The file name is quoted on both sides (the closing quote used to be
+      // missing).
+      msg << "error " << errnoSave << " opening file \"" << filename << "\"";
+      throw std::runtime_error(msg.str());
+    }
+  }
+}
+
+#ifndef _WIN32
+// Builds a single-part compound from an already opened file descriptor.
+// The compound has no file name of its own (empty string).
+FileCompound::FileCompound(int fd):
+ _filename(),
+ _fsize(0)
+{
+ addPart(new FilePart(fd));
+}
+#endif
+
+// The compound owns its parts: delete every FilePart stored in the map.
+FileCompound::~FileCompound() {
+  for (const auto& entry : *this) {
+    delete entry.second;
+  }
+}
+
+// Returns the modification time of the first part of the compound.
+// The value is cached in `mtime`; a zero `mtime` means "not fetched yet".
+// When the compound has no part, the current `mtime` is returned as is.
+// Throws std::runtime_error if stat() fails.
+time_t FileCompound::getMTime() const {
+ if (mtime || empty())
+ return mtime;
+
+ const char* fname = begin()->second->filename().c_str();
+
+ // Use the 64-bit stat variant where it is available (except on macOS).
+ #if defined(HAVE_STAT64) && ! defined(__APPLE__)
+ struct stat64 st;
+ int ret = ::stat64(fname, &st);
+ #else
+ struct stat st;
+ int ret = ::stat(fname, &st);
+ #endif
+ if (ret != 0)
+ {
+ std::ostringstream msg;
+ msg << "stat failed with errno " << errno << " : " << strerror(errno);
+ throw std::runtime_error(msg.str());
+ }
+ mtime = st.st_mtime;
+
+ return mtime;
+
+}
+
+} // zim
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_COMPOUND_H_
+#define ZIM_FILE_COMPOUND_H_
+
+#include "file_part.h"
+#include "zim_types.h"
+#include "debug.h"
+#include <map>
+#include <memory>
+#include <cstdio>
+
+namespace zim {
+
+// A range of offsets delimited by `min` and `max`, used as the key locating
+// a file part inside a FileCompound. Both bounds are immutable.
+struct Range {
+ Range(const offset_t min, const offset_t max)
+ : min(min), max(max)
+ {
+ // The check is disabled: ranges with min == max are built on purpose
+ // (see FileCompound::locate(offset_t)).
+ // ASSERT(min, <, max);
+ }
+
+ const offset_t min;
+ const offset_t max;
+};
+
+// Strict ordering of ranges: `lhs` is before `rhs` when it starts before it
+// and ends no later than the start of `rhs`. Ranges for which neither one is
+// "before" the other compare as equivalent, which is what lets a map keyed
+// by Range be queried with a range (or a single offset) falling inside a key.
+//
+// The std::binary_function base (deprecated in C++11, removed in C++17) is
+// replaced by the equivalent member typedefs; nothing else is needed for a
+// std::map comparator.
+struct less_range
+{
+  typedef Range first_argument_type;
+  typedef Range second_argument_type;
+  typedef bool result_type;
+
+  bool operator()(const Range& lhs, const Range& rhs) const {
+    return lhs.min < rhs.min && lhs.max <= rhs.min;
+  }
+};
+
+// A zim file seen as one contiguous range of bytes, backed by one or
+// several FilePart objects (a single file, or the parts "<name>aa",
+// "<name>ab", ... of a split file).
+//
+// The parts are stored in the underlying map, keyed by the Range of global
+// offsets each of them covers. The FilePart objects are owned by the
+// compound and deleted by its destructor.
+class FileCompound : private std::map<Range, FilePart*, less_range> {
+    typedef std::map<Range, FilePart*, less_range> ImplType;
+
+  public: // types
+    typedef const_iterator PartIterator;
+    typedef std::pair<PartIterator, PartIterator> PartRange;
+
+  public: // functions
+    // Opens `filename`, either as a single file or as a split file.
+    // Throws std::runtime_error when nothing can be opened.
+    explicit FileCompound(const std::string& filename);
+
+#ifndef _WIN32
+    // Builds a single-part compound from an opened file descriptor.
+    explicit FileCompound(int fd);
+#endif
+
+    ~FileCompound();
+
+    using ImplType::begin;
+    using ImplType::end;
+
+    const std::string& filename() const { return _filename; }
+    // Total size of the compound (sum of the sizes of its parts).
+    zsize_t fsize() const { return _fsize; }
+    // Modification time of the first part (cached after the first call).
+    time_t getMTime() const;
+    bool fail() const { return empty(); }
+    bool is_multiPart() const { return size() > 1; }
+
+    // Returns the part containing `offset`.
+    PartIterator locate(offset_t offset) const {
+      const PartIterator partIt = lower_bound(Range(offset, offset));
+      ASSERT(partIt != end(), ==, true);
+      return partIt;
+    }
+
+    // Returns the parts overlapping [offset, offset+size).
+    PartRange locate(offset_t offset, zsize_t size) const {
+#if ! defined(__APPLE__)
+      return equal_range(Range(offset, offset+size));
+#else
+      // Workaround for https://github.com/openzim/libzim/issues/398
+      // Under MacOS the implementation of std::map::equal_range() makes
+      // assumptions about the properties of the key comparison function and
+      // abuses the std::map requirement that it must contain unique keys. As
+      // a result, when a map m is queried with an element k that is
+      // equivalent to more than one keys present in m,
+      // m.equal_range(k).first may be different from m.lower_bound(k) (the
+      // latter one returning the correct result).
+      const Range queryRange(offset, offset+size);
+      return {lower_bound(queryRange), upper_bound(queryRange)};
+#endif // ! defined(__APPLE__)
+    }
+
+  private: // functions
+    // Appends a part at the end of the compound and takes ownership of it.
+    void addPart(FilePart* fpart);
+
+  private: // data
+    std::string _filename;
+    zsize_t _fsize;
+    // Cached modification time; 0 means "not fetched yet". It must start at
+    // 0 because getMTime() tests it before ever assigning it.
+    mutable time_t mtime = 0;
+};
+
+
+};
+
+
+#endif //ZIM_FILE_COMPOUND_H_
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_PART_H_
+#define ZIM_FILE_PART_H_
+
+#include <string>
+#include <cstdio>
+#include <memory>
+
+#include <zim/zim.h>
+
+#include "zim_types.h"
+#include "fs.h"
+
+namespace zim {
+
+// One physical file of a (possibly split) zim file: its name, an opened
+// handle on it and its size.
+class FilePart {
+ typedef DEFAULTFS FS;
+
+ public:
+ using FDSharedPtr = std::shared_ptr<FS::FD>;
+
+ public:
+ // Opens `filename` and records its size.
+ FilePart(const std::string& filename) :
+ m_filename(filename),
+ m_fhandle(std::make_shared<FS::FD>(FS::openFile(filename))),
+ m_size(m_fhandle->getSize()) {}
+
+#ifndef _WIN32
+ // Opens the file designated by `fd` through its path (as returned by
+ // getFilePathFromFD); `fd` itself is not stored.
+ FilePart(int fd) :
+ FilePart(getFilePathFromFD(fd)) {}
+#endif
+
+ ~FilePart() = default;
+ const std::string& filename() const { return m_filename; };
+ const FS::FD& fhandle() const { return *m_fhandle; };
+ // Handle that can be shared with (and outlive in) other objects.
+ const FDSharedPtr& shareable_fhandle() const { return m_fhandle; };
+
+ zsize_t size() const { return m_size; };
+ // A part is considered failed when its size is zero.
+ bool fail() const { return !m_size; };
+ bool good() const { return bool(m_size); };
+
+ private:
+ const std::string m_filename;
+ FDSharedPtr m_fhandle;
+ zsize_t m_size;
+};
+
+};
+
+#endif //ZIM_FILE_PART_H_
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/error.h>
+#include "file_reader.h"
+#include "file_compound.h"
+#include "buffer.h"
+#include <errno.h>
+#include <string.h>
+#include <cstring>
+#include <fcntl.h>
+#include <sstream>
+#include <system_error>
+#include <algorithm>
+
+
+#ifndef _WIN32
+# include <sys/mman.h>
+# include <unistd.h>
+#endif
+
+#if defined(_MSC_VER)
+# include <io.h>
+# include <BaseTsd.h>
+ typedef SSIZE_T ssize_t;
+#endif
+
+namespace zim {
+
+////////////////////////////////////////////////////////////////////////////////
+// MultiPartFileReader
+////////////////////////////////////////////////////////////////////////////////
+
+// Reader covering the whole compound file: delegates to the windowed
+// constructor with offset 0 and size source->fsize().
+MultiPartFileReader::MultiPartFileReader(std::shared_ptr<const FileCompound> source)
+ : MultiPartFileReader(source, offset_t(0), source->fsize()) {}
+
+// Reader restricted to the window of `size` bytes starting at `offset` inside
+// `source`. Both the start and the end of the window are asserted to lie
+// within the compound file.
+MultiPartFileReader::MultiPartFileReader(std::shared_ptr<const FileCompound> source, offset_t offset, zsize_t size)
+ : source(source),
+ _offset(offset),
+ _size(size)
+{
+ ASSERT(offset.v, <=, source->fsize().v);
+ ASSERT(offset.v+size.v, <=, source->fsize().v);
+}
+
+// Return the single byte found at `offset` (relative to this reader).
+// The part holding that byte is looked up in the compound file and read at
+// the matching part-local offset. A std::runtime_error raised by the read is
+// converted into a std::system_error built from errno, whose message
+// describes the part and the offsets involved.
+char MultiPartFileReader::read(offset_t offset) const {
+  ASSERT(offset.v, <, _size.v);
+  offset += _offset;  // now an offset inside the compound file
+  auto located = source->locate(offset);
+  auto& handle = located->second->fhandle();
+  offset_t local_offset = offset - located->first.min;
+  ASSERT(local_offset, <=, located->first.max);
+  char byte;
+  try {
+    handle.readAt(&byte, zsize_t(1), local_offset);
+  } catch (std::runtime_error&) {
+    // Error while reading: report as much context as possible.
+    std::ostringstream msg;
+    msg << "Cannot read a char.\n";
+    msg << " - File part is " << located->second->filename() << "\n"
+        << " - File part size is " << located->second->size().v << "\n"
+        << " - File part range is " << located->first.min << "-" << located->first.max << "\n";
+    msg << " - Reading offset at " << offset.v << "\n"
+        << " - local offset is " << local_offset.v << "\n";
+    msg << " - error is " << strerror(errno) << "\n";
+    std::error_code code(errno, std::generic_category());
+    throw std::system_error(code, msg.str());
+  }
+  return byte;
+}
+
+// Copy `size` bytes starting at `offset` (relative to this reader) into
+// `dest`, walking over every file part returned by source->locate() for the
+// requested range. A std::runtime_error raised by a part read is rethrown as
+// a std::system_error built from errno, with the part details in the message.
+void MultiPartFileReader::read(char* dest, offset_t offset, zsize_t size) const {
+ ASSERT(offset.v, <=, _size.v);
+ ASSERT(offset.v+size.v, <=, _size.v);
+ // Nothing to copy for an empty request.
+ if (! size ) {
+ return;
+ }
+ // From here on `offset` is an offset inside the compound file.
+ offset += _offset;
+ auto found_range = source->locate(offset, size);
+ for(auto current = found_range.first; current!=found_range.second; current++){
+ auto part = current->second;
+ Range partRange = current->first;
+ offset_t local_offset = offset-partRange.min;
+ ASSERT(size.v, >, 0U);
+ // Read what is still wanted, capped to what remains in this part.
+ zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v));
+ try {
+ part->fhandle().readAt(dest, size_to_get, local_offset);
+ } catch (std::runtime_error& e) {
+ std::ostringstream s;
+ s << "Cannot read chars.\n";
+ s << " - File part is " << part->filename() << "\n";
+ s << " - File part size is " << part->size().v << "\n";
+ s << " - File part range is " << partRange.min << "-" << partRange.max << "\n";
+ s << " - size_to_get is " << size_to_get.v << "\n";
+ s << " - total size is " << size.v << "\n";
+ s << " - Reading offset at " << offset.v << "\n";
+ s << " - local offset is " << local_offset.v << "\n";
+ s << " - error is " << strerror(errno) << "\n";
+ std::error_code ec(errno, std::generic_category());
+ throw std::system_error(ec, s.str());
+ };
+ ASSERT(size_to_get, <=, size);
+ // Move the destination pointer and the remaining range past what was read.
+ dest += size_to_get.v;
+ size -= size_to_get;
+ offset += size_to_get;
+ }
+ // Every requested byte must have been served by some part.
+ ASSERT(size.v, ==, 0U);
+}
+
+#ifdef ENABLE_USE_MMAP
+namespace
+{
+
+// Internal signal meaning "this range cannot be served through mmap". It is
+// thrown in this file and caught by MultiPartFileReader::get_buffer(), which
+// then falls back to a copying read. Note that the std::exception base is
+// private (default for `class`), so it cannot be caught as std::exception.
+class MMapException : std::exception {};
+
+// Map `size` bytes of the file behind `fd`, starting at `offset`, as private
+// read-only memory and return the address of the mapping.
+// The mapping flags depend on the platform (see the preprocessor branches).
+// Throws std::runtime_error, including the strerror() text, if mmap() fails.
+char*
+mmapReadOnly(int fd, offset_type offset, size_type size)
+{
+#if defined(__FreeBSD__)
+  const auto mapFlags = MAP_PRIVATE|MAP_PREFAULT_READ;
+#elif defined(__APPLE__) || defined(__OpenBSD__)
+  const auto mapFlags = MAP_PRIVATE;
+#else
+  const auto mapFlags = MAP_PRIVATE|MAP_POPULATE;
+#endif
+
+  void* const mapping = mmap(NULL, size, PROT_READ, mapFlags, fd, offset);
+  if (mapping != MAP_FAILED) {
+    return static_cast<char*>(mapping);
+  }
+
+  std::ostringstream msg;
+  msg << "Cannot mmap size " << size << " at off " << offset
+      << " : " << strerror(errno);
+  throw std::runtime_error(msg.str());
+}
+
+// Wrap an mmap of `size` bytes of `fd`, starting at `offset`, into a
+// Buffer::DataPtr. The mapping itself starts at the page boundary at or below
+// `offset`; the returned pointer is shifted forward to the requested byte and
+// its deleter unmaps the whole mapping.
+// Throws MMapException when !MMAP_SUPPORT_64 and the aligned offset is at or
+// beyond INT32_MAX; errors from mmapReadOnly() propagate unchanged.
+Buffer::DataPtr
+makeMmappedBuffer(int fd, offset_t offset, zsize_t size)
+{
+ // Clear the low bits of the offset so that it is a multiple of the page size.
+ const offset_type pageAlignedOffset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1));
+ const size_t alignmentAdjustment = offset.v - pageAlignedOffset;
+ // The mapping must also cover the bytes between the page start and `offset`.
+ size += alignmentAdjustment;
+
+#if !MMAP_SUPPORT_64
+ if(pageAlignedOffset >= INT32_MAX) {
+ throw MMapException();
+ }
+#endif
+ char* const mmappedAddress = mmapReadOnly(fd, pageAlignedOffset, size.v);
+ // The deleter ignores the (shifted) pointer it receives and releases the
+ // mapping from its real start address and enlarged size.
+ const auto munmapDeleter = [mmappedAddress, size](char* ) {
+ munmap(mmappedAddress, size.v);
+ };
+
+ return Buffer::DataPtr(mmappedAddress+alignmentAdjustment, munmapDeleter);
+}
+
+} // unnamed namespace
+#endif // ENABLE_USE_MMAP
+
+// Return a Buffer holding `size` bytes starting at `offset` (relative to this
+// reader). With ENABLE_USE_MMAP, a range contained in a single file part is
+// memory-mapped; when the range spans several parts (or mmap signals
+// MMapException) the bytes are copied with read() into a fresh buffer.
+// Without ENABLE_USE_MMAP only the copying path is compiled.
+// NOTE(review): only `size` is asserted here; offset+size is checked later by
+// read() on the copying path only.
+const Buffer MultiPartFileReader::get_buffer(offset_t offset, zsize_t size) const {
+ ASSERT(size, <=, _size);
+#ifdef ENABLE_USE_MMAP
+ try {
+ auto found_range = source->locate(_offset+offset, size);
+ auto first_part_containing_it = found_range.first;
+ // More than one part in the located range: mmap cannot be used.
+ if (++first_part_containing_it != found_range.second) {
+ throw MMapException();
+ }
+
+ // The range is in only one part
+ auto range = found_range.first->first;
+ auto part = found_range.first->second;
+ auto local_offset = offset + _offset - range.min;
+ ASSERT(size, <=, part->size());
+ int fd = part->fhandle().getNativeHandle();
+ return Buffer::makeBuffer(makeMmappedBuffer(fd, local_offset, size), size);
+ } catch(MMapException& e)
+#endif
+ {
+ // The range is several part, or we are on Windows.
+ // We will have to do some memory copies :/
+ // [TODO] Use Windows equivalent for mmap.
+ auto ret_buffer = Buffer::makeBuffer(size);
+ read(const_cast<char*>(ret_buffer.data()), offset, size);
+ return ret_buffer;
+ }
+}
+
+// Tell whether the range [offset, offset+size) lies entirely inside this
+// reader.
+// The end of the range is checked as `size <= size() - offset` rather than
+// `offset + size <= size()`: the sum of two unsigned values can wrap around,
+// which would make an out-of-range request (e.g. built from a corrupt
+// header) look valid. The subtraction cannot underflow because it is only
+// evaluated once `offset <= size()` is known to hold.
+bool Reader::can_read(offset_t offset, zsize_t size) const
+{
+  const auto total = this->size().v;
+  return offset.v <= total && size.v <= total - offset.v;
+}
+
+
+// Create a reader over the window of `size` bytes starting at `offset`
+// (relative to this reader). The new reader shares the same FileCompound and
+// its start is expressed in compound-file coordinates (_offset+offset).
+std::unique_ptr<const Reader> MultiPartFileReader::sub_reader(offset_t offset, zsize_t size) const
+{
+ ASSERT(offset.v+size.v, <=, _size.v);
+ // TODO: can use a FileReader here if the new range fully belongs to a single part
+ return std::unique_ptr<Reader>(new MultiPartFileReader(source, _offset+offset, size));
+}
+
+////////////////////////////////////////////////////////////////////////////////
+// FileReader
+////////////////////////////////////////////////////////////////////////////////
+
+// Reader over `size` bytes of the file behind `fh`, starting at `offset`.
+// The values are stored as given; no range validation happens here.
+FileReader::FileReader(FileHandle fh, offset_t offset, zsize_t size)
+ : _fhandle(fh)
+ , _offset(offset)
+ , _size(size)
+{
+}
+
+// Return the single byte found at `offset` (relative to this reader).
+// A std::runtime_error raised by the underlying read is converted into a
+// std::system_error built from errno.
+char FileReader::read(offset_t offset) const
+{
+  ASSERT(offset.v, <, _size.v);
+  offset += _offset;  // now an offset inside the file
+  char byte;
+  try {
+    _fhandle->readAt(&byte, zsize_t(1), offset);
+  } catch (std::runtime_error&) {
+    // Error while reading.
+    std::ostringstream msg;
+    msg << "Cannot read a char.\n"
+        << " - Reading offset at " << offset.v << "\n";
+    msg << " - error is " << strerror(errno) << "\n";
+    std::error_code code(errno, std::generic_category());
+    throw std::system_error(code, msg.str());
+  }
+  return byte;
+}
+
+// Copy `size` bytes starting at `offset` (relative to this reader) into
+// `dest`. An empty request returns immediately. A std::runtime_error raised
+// by the underlying read is converted into a std::system_error built from
+// errno.
+void FileReader::read(char* dest, offset_t offset, zsize_t size) const
+{
+  ASSERT(offset.v, <=, _size.v);
+  ASSERT(offset.v+size.v, <=, _size.v);
+  if (!size) {
+    return;
+  }
+
+  offset += _offset;  // now an offset inside the file
+  try {
+    _fhandle->readAt(dest, size, offset);
+  } catch (std::runtime_error&) {
+    std::ostringstream msg;
+    msg << "Cannot read chars.\n"
+        << " - Reading offset at " << offset.v << "\n"
+        << " - size is " << size.v << "\n";
+    msg << " - error is " << strerror(errno) << "\n";
+    std::error_code code(errno, std::generic_category());
+    throw std::system_error(code, msg.str());
+  }
+}
+
+// Return a Buffer holding `size` bytes starting at `offset` (relative to this
+// reader).
+// With ENABLE_USE_MMAP the range is memory-mapped. makeMmappedBuffer() may
+// refuse the request by throwing MMapException (offsets it cannot map when
+// 64-bit mmap is unavailable); as in MultiPartFileReader::get_buffer(), that
+// case now falls back to copying the bytes with read() instead of letting
+// the internal exception escape to the caller.
+// Without ENABLE_USE_MMAP only the copying path is compiled.
+const Buffer FileReader::get_buffer(offset_t offset, zsize_t size) const
+{
+  ASSERT(size, <=, _size);
+#ifdef ENABLE_USE_MMAP
+  try {
+    // Keep `offset` untouched: the fallback below goes through read(), which
+    // applies _offset itself.
+    const auto file_offset = offset + _offset;
+    int fd = _fhandle->getNativeHandle();
+    return Buffer::makeBuffer(makeMmappedBuffer(fd, file_offset, size), size);
+  } catch (MMapException&)
+#endif
+  {
+    // mmap is unusable for this range, or we are on Windows.
+    // [TODO] Use Windows equivalent for mmap.
+    auto ret_buffer = Buffer::makeBuffer(size);
+    read(const_cast<char*>(ret_buffer.data()), offset, size);
+    return ret_buffer;
+  }
+}
+
+// Create a reader over the window of `size` bytes starting at `offset`
+// (relative to this reader). The new reader shares the same file handle and
+// its start is expressed in file coordinates (_offset + offset).
+std::unique_ptr<const Reader>
+FileReader::sub_reader(offset_t offset, zsize_t size) const
+{
+ ASSERT(offset.v+size.v, <=, _size.v);
+ return std::unique_ptr<const Reader>(new FileReader(_fhandle, _offset + offset, size));
+}
+
+} // zim
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_READER_H_
+#define ZIM_FILE_READER_H_
+
+#include "reader.h"
+#include "fs.h"
+
+namespace zim {
+
+class FileCompound;
+
+// Reader over a window (offset, size) of a single file, accessed through a
+// shared file handle.
+class FileReader : public Reader {
+ public: // types
+ typedef std::shared_ptr<const DEFAULTFS::FD> FileHandle;
+
+ public: // functions
+ // `offset` and `size` delimit the window of the file exposed by the reader.
+ explicit FileReader(FileHandle fh, offset_t offset, zsize_t size);
+ ~FileReader() = default;
+
+ // Size of the window and its start inside the file.
+ zsize_t size() const { return _size; };
+ offset_t offset() const { return _offset; };
+
+ // Offsets passed to the functions below are relative to the window start.
+ char read(offset_t offset) const;
+ void read(char* dest, offset_t offset, zsize_t size) const;
+ const Buffer get_buffer(offset_t offset, zsize_t size) const;
+
+ // Reader over a sub-window, sharing the same file handle.
+ std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const;
+
+ private: // data
+ // The file handle is stored via a shared pointer so that it can be shared
+ // by a sub_reader (otherwise the file handle would be invalidated by
+ // FD destructor when the sub-reader is destroyed).
+ FileHandle _fhandle;
+ offset_t _offset;
+ zsize_t _size;
+};
+
+// Reader over a window (offset, size) of a FileCompound, i.e. a file that may
+// be made of several parts.
+class MultiPartFileReader : public Reader {
+ public:
+ // Reader covering the whole compound file.
+ MultiPartFileReader(std::shared_ptr<const FileCompound> source);
+ ~MultiPartFileReader() {};
+
+ // Size of the window and its start inside the compound file.
+ zsize_t size() const { return _size; };
+ offset_t offset() const { return _offset; };
+
+ // Offsets passed to the functions below are relative to the window start.
+ char read(offset_t offset) const;
+ void read(char* dest, offset_t offset, zsize_t size) const;
+ const Buffer get_buffer(offset_t offset, zsize_t size) const;
+
+ // Reader over a sub-window, sharing the same FileCompound.
+ std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const;
+
+ private:
+ // Windowed constructor; private, used by the public constructor and by
+ // sub_reader().
+ MultiPartFileReader(std::shared_ptr<const FileCompound> source, offset_t offset, zsize_t size);
+
+ std::shared_ptr<const FileCompound> source;
+ offset_t _offset;
+ zsize_t _size;
+};
+
+};
+
+#endif // ZIM_FILE_READER_H_
--- /dev/null
+/*
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2008 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fileheader.h"
+#include <zim/error.h>
+#include <iostream>
+#include <algorithm>
+#include "log.h"
+#include "endian_tools.h"
+#include "reader.h"
+#include "bufferstreamer.h"
+#include "buffer.h"
+#ifdef _WIN32
+# include "io.h"
+#else
+# include "unistd.h"
+# define _write(fd, addr, size) ::write((fd), (addr), (size))
+#endif
+
+log_define("zim.file.header")
+
+namespace zim
+{
+ // Magic number stored (little-endian) in the first four bytes of the header.
+ const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d"
+ // Both major versions below are accepted by Fileheader::read().
+ const uint16_t Fileheader::zimOldMajorVersion = 5;
+ const uint16_t Fileheader::zimMajorVersion = 6;
+ const uint16_t Fileheader::zimMinorVersion = 1;
+ // Number of bytes written by Fileheader::write() and read by Fileheader::read().
+ const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset)
+
+ // Serialize the header into a Fileheader::size byte array (little-endian
+ // fields at fixed offsets) and write it to `out_fd` with a single write call.
+ // Throws std::runtime_error if the call does not report exactly
+ // Fileheader::size bytes written.
+ void Fileheader::write(int out_fd) const
+ {
+ char header[Fileheader::size];
+ // Byte offsets: 0 magic, 4 major, 6 minor, 8 uuid, 24 article count,
+ // 28 cluster count, 32 url ptr pos, 40 title idx pos, 48 cluster ptr pos,
+ // 56 mime list pos, 64 main page, 68 layout page, 72 checksum pos.
+ toLittleEndian(Fileheader::zimMagic, header);
+ toLittleEndian(getMajorVersion(), header + 4);
+ toLittleEndian(getMinorVersion(), header + 6);
+ std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8);
+ toLittleEndian(getArticleCount(), header + 24);
+ toLittleEndian(getClusterCount(), header + 28);
+ toLittleEndian(getUrlPtrPos(), header + 32);
+ toLittleEndian(getTitleIdxPos(), header + 40);
+ toLittleEndian(getClusterPtrPos(), header + 48);
+ toLittleEndian(getMimeListPos(), header + 56);
+ toLittleEndian(getMainPage(), header + 64);
+ toLittleEndian(getLayoutPage(), header + 68);
+ toLittleEndian(getChecksumPos(), header + 72);
+
+ // A short write is treated as an error; it is not retried.
+ auto ret = _write(out_fd, header, Fileheader::size);
+ if (ret != Fileheader::size) {
+ std::cerr << "Error Writing" << std::endl;
+ std::cerr << "Ret is " << ret << std::endl;
+ perror("Error writing");
+ throw std::runtime_error("Error writing");
+ }
+ }
+
+ // Fill this header from the first Fileheader::size bytes of `reader`.
+ // Fields are consumed sequentially in the same order write() emits them.
+ // Throws ZimFileFormatError on a wrong magic number or an unsupported major
+ // version; sanity_check() is run on the decoded values before returning.
+ void Fileheader::read(const Reader& reader)
+ {
+ auto buffer = reader.get_buffer(offset_t(0), zsize_t(Fileheader::size));
+ auto seqReader = BufferStreamer(buffer);
+ uint32_t magicNumber = seqReader.read<uint32_t>();
+ if (magicNumber != Fileheader::zimMagic)
+ {
+ log_error("invalid magic number " << magicNumber << " found - "
+ << Fileheader::zimMagic << " expected");
+ throw ZimFileFormatError("Invalid magic number");
+ }
+
+ // Both the old and the current major version are accepted.
+ uint16_t major_version = seqReader.read<uint16_t>();
+ if (major_version != zimOldMajorVersion && major_version != zimMajorVersion)
+ {
+ log_error("invalid zimfile major version " << major_version << " found - "
+ << Fileheader::zimMajorVersion << " expected");
+ throw ZimFileFormatError("Invalid version");
+ }
+ setMajorVersion(major_version);
+
+ setMinorVersion(seqReader.read<uint16_t>());
+
+ // The uuid is copied as 16 raw bytes, then skipped in the stream.
+ Uuid uuid;
+ std::copy(seqReader.current(), seqReader.current()+16, uuid.data);
+ seqReader.skip(zsize_t(16));
+ setUuid(uuid);
+
+ setArticleCount(seqReader.read<uint32_t>());
+ setClusterCount(seqReader.read<uint32_t>());
+ setUrlPtrPos(seqReader.read<uint64_t>());
+ setTitleIdxPos(seqReader.read<uint64_t>());
+ setClusterPtrPos(seqReader.read<uint64_t>());
+ setMimeListPos(seqReader.read<uint64_t>());
+ setMainPage(seqReader.read<uint32_t>());
+ setLayoutPage(seqReader.read<uint32_t>());
+ setChecksumPos(seqReader.read<uint64_t>());
+
+ sanity_check();
+ }
+
+ // Validate the relations between the header fields; the first violated rule
+ // raises a ZimFileFormatError. Rules, in order:
+ //  - articles and clusters are either both present or both absent;
+ //  - mimeListPos is either `size` or 72;
+ //  - urlPtrPos, titleIdxPos and clusterPtrPos are not below mimeListPos;
+ //  - there are no more clusters than articles;
+ //  - a non-zero checksumPos is not below mimeListPos.
+ void Fileheader::sanity_check() const {
+   const bool hasArticles = (articleCount != 0);
+   const bool hasClusters = (clusterCount != 0);
+   if (hasArticles != hasClusters) {
+     throw ZimFileFormatError("No article <=> No cluster");
+   }
+
+   const bool knownMimeListPos = (mimeListPos == size) || (mimeListPos == 72);
+   if (!knownMimeListPos) {
+     throw ZimFileFormatError("mimelistPos must be 80.");
+   }
+
+   if (mimeListPos > urlPtrPos) {
+     throw ZimFileFormatError("urlPtrPos must be > mimelistPos.");
+   }
+   if (mimeListPos > titleIdxPos) {
+     throw ZimFileFormatError("titleIdxPos must be > mimelistPos.");
+   }
+   if (mimeListPos > clusterPtrPos) {
+     throw ZimFileFormatError("clusterPtrPos must be > mimelistPos.");
+   }
+
+   if (articleCount < clusterCount) {
+     throw ZimFileFormatError("Cluster count cannot be higher than article count.");
+   }
+
+   const bool checksumPosIsSet = (checksumPos != 0);
+   if (checksumPosIsSet && mimeListPos > checksumPos) {
+     throw ZimFileFormatError("checksumPos must be > mimeListPos.");
+   }
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2008 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILEHEADER_H
+#define ZIM_FILEHEADER_H
+
+#include <memory>
+#include <zim/zim.h>
+#include <zim/uuid.h>
+#include <iosfwd>
+#include <limits>
+
+// max may be defined as a macro by window includes
+#ifdef max
+#undef max
+#endif
+
+namespace zim
+{
+ class Reader;
+ // In-memory representation of the fixed-size header of a ZIM file.
+ // write() serializes it to a file descriptor, read() decodes it from a
+ // Reader and validates it with sanity_check().
+ class Fileheader
+ {
+   public:
+     static const uint32_t zimMagic;
+     static const uint16_t zimOldMajorVersion;
+     static const uint16_t zimMajorVersion;
+     static const uint16_t zimMinorVersion;
+     static const size_type size;
+
+   private:
+     uint16_t majorVersion;
+     uint16_t minorVersion;
+     Uuid uuid;
+     entry_index_type articleCount;
+     offset_type titleIdxPos;
+     offset_type urlPtrPos;
+     offset_type mimeListPos;
+     cluster_index_type clusterCount;
+     offset_type clusterPtrPos;
+     entry_index_type mainPage;
+     entry_index_type layoutPage;
+     offset_type checksumPos;
+
+   public:
+     // Default header: current versions, zero counts and positions, and the
+     // maximum value of their type for mainPage, layoutPage and checksumPos
+     // (hasMainPage()/hasLayoutPage() treat that value as "absent").
+     // mimeListPos is explicitly zeroed: hasChecksum() and getChecksumPos()
+     // read it, so leaving it uninitialized made their result indeterminate
+     // on a header that had not gone through read() or setMimeListPos().
+     Fileheader()
+       : majorVersion(zimMajorVersion),
+         minorVersion(zimMinorVersion),
+         articleCount(0),
+         titleIdxPos(0),
+         urlPtrPos(0),
+         mimeListPos(0),
+         clusterCount(0),
+         clusterPtrPos(0),
+         mainPage(std::numeric_limits<entry_index_type>::max()),
+         layoutPage(std::numeric_limits<entry_index_type>::max()),
+         checksumPos(std::numeric_limits<offset_type>::max())
+     {}
+
+     void write(int out_fd) const;
+     void read(const Reader& reader);
+
+     // Do some sanity checks; raise a ZimFileFormatError if
+     // something is wrong.
+     void sanity_check() const;
+
+     uint16_t getMajorVersion() const { return majorVersion; }
+     void setMajorVersion(uint16_t v) { majorVersion = v; }
+
+     uint16_t getMinorVersion() const { return minorVersion; }
+     void setMinorVersion(uint16_t v) { minorVersion = v; }
+
+     const Uuid& getUuid() const { return uuid; }
+     void setUuid(const Uuid& uuid_) { uuid = uuid_; }
+
+     entry_index_type getArticleCount() const { return articleCount; }
+     void setArticleCount(entry_index_type s) { articleCount = s; }
+
+     offset_type getTitleIdxPos() const { return titleIdxPos; }
+     void setTitleIdxPos(offset_type p) { titleIdxPos = p; }
+
+     offset_type getUrlPtrPos() const { return urlPtrPos; }
+     void setUrlPtrPos(offset_type p) { urlPtrPos = p; }
+
+     offset_type getMimeListPos() const { return mimeListPos; }
+     void setMimeListPos(offset_type p) { mimeListPos = p; }
+
+     cluster_index_type getClusterCount() const { return clusterCount; }
+     void setClusterCount(cluster_index_type s) { clusterCount = s; }
+
+     offset_type getClusterPtrPos() const { return clusterPtrPos; }
+     void setClusterPtrPos(offset_type p) { clusterPtrPos = p; }
+
+     // The maximum entry_index_type value marks "no main page".
+     bool hasMainPage() const { return mainPage != std::numeric_limits<entry_index_type>::max(); }
+     entry_index_type getMainPage() const { return mainPage; }
+     void setMainPage(entry_index_type s){ mainPage = s; }
+
+     // The maximum entry_index_type value marks "no layout page".
+     bool hasLayoutPage() const { return layoutPage != std::numeric_limits<entry_index_type>::max(); }
+     entry_index_type getLayoutPage() const { return layoutPage; }
+     void setLayoutPage(entry_index_type s) { layoutPage = s; }
+
+     // A checksum position is only considered present when the mime list
+     // starts at byte 80 or later; otherwise getChecksumPos() reports 0.
+     bool hasChecksum() const { return getMimeListPos() >= 80; }
+     offset_type getChecksumPos() const { return hasChecksum() ? checksumPos : 0; }
+     void setChecksumPos(offset_type p) { checksumPos = p; }
+ };
+
+}
+
+#endif // ZIM_FILEHEADER_H
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2006,2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fileimpl.h"
+#include <zim/error.h>
+#include "_dirent.h"
+#include "file_compound.h"
+#include "buffer_reader.h"
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sstream>
+#include <errno.h>
+#include <cstring>
+#include <fstream>
+#include "config.h"
+#include "log.h"
+#include "envvalue.h"
+#include "md5.h"
+#include "tools.h"
+
+log_define("zim.file.impl")
+
+namespace zim
+{
+
+namespace
+{
+
+// Read the `idx`-th entry of a table of offset_type values exposed by
+// `reader` and return it as an offset_t.
+offset_t readOffset(const Reader& reader, entry_index_type idx)
+{
+  const offset_t entryPosition(sizeof(offset_type)*idx);
+  return offset_t(reader.read_uint<offset_type>(entryPosition));
+}
+
+// Return a reader over the section of `size` bytes starting at `offset` in
+// `zimReader`. Throws ZimFileFormatError, naming the section, when that range
+// is not fully readable. With ENABLE_USE_BUFFER_HEADER the section is fetched
+// with get_buffer() and served by a BufferReader; otherwise a sub_reader of
+// `zimReader` is returned.
+std::unique_ptr<const Reader>
+sectionSubReader(const Reader& zimReader, const std::string& sectionName,
+ offset_t offset, zsize_t size)
+{
+ if (!zimReader.can_read(offset, size)) {
+ throw ZimFileFormatError(sectionName + " outside (or not fully inside) ZIM file.");
+ }
+#ifdef ENABLE_USE_BUFFER_HEADER
+ const auto buf = zimReader.get_buffer(offset, size);
+ return std::unique_ptr<Reader>(new BufferReader(buf));
+#else
+ return zimReader.sub_reader(offset, size);
+#endif
+}
+
+// Pick the reader implementation matching `zimFile`:
+//  - a failed compound file yields a null pointer;
+//  - a single-part file yields a FileReader over [offset, offset+size) that
+//    shares the part's file handle;
+//  - a multi-part file yields a MultiPartFileReader over the whole file, in
+//    which case `offset` must be 0 and `size` the full file size (asserted).
+std::shared_ptr<Reader>
+makeFileReader(std::shared_ptr<const FileCompound> zimFile, offset_t offset, zsize_t size)
+{
+  if (zimFile->fail()) {
+    return nullptr;
+  }
+
+  if (!zimFile->is_multiPart()) {
+    const auto& onlyPart = zimFile->begin()->second;
+    return std::make_shared<FileReader>(onlyPart->shareable_fhandle(), offset, size);
+  }
+
+  ASSERT(offset.v, ==, 0u);
+  ASSERT(size, ==, zimFile->fsize());
+  return std::make_shared<MultiPartFileReader>(zimFile);
+}
+
+} //unnamed namespace
+
+ //////////////////////////////////////////////////////////////////////
+ // FileImpl
+ //
+ // Open the archive stored at path `fname`: builds a FileCompound from the
+ // path and delegates to the FileCompound-based constructor.
+ FileImpl::FileImpl(const std::string& fname)
+ : FileImpl(std::make_shared<FileCompound>(fname))
+ {}
+
+#ifndef _WIN32
+ // Open the archive behind the file descriptor `fd` (non-Windows only).
+ FileImpl::FileImpl(int fd)
+ : FileImpl(std::make_shared<FileCompound>(fd))
+ {}
+
+ // Open an archive embedded in the file behind `fd`: the archive occupies
+ // `size` bytes starting at `offset` (non-Windows only).
+ FileImpl::FileImpl(int fd, offset_t offset, zsize_t size)
+ : FileImpl(std::make_shared<FileCompound>(fd), offset, size)
+ {}
+#endif
+
+ // Archive spanning the whole compound file: delegates with offset 0 and
+ // size _zimFile->fsize().
+ FileImpl::FileImpl(std::shared_ptr<FileCompound> _zimFile)
+ : FileImpl(_zimFile, offset_t(0), _zimFile->fsize())
+ {}
+
+ // Main constructor: open the archive occupying `size` bytes at `offset` of
+ // `_zimFile` and load its structural tables.
+ // Steps: check the file could be opened, read and validate the header,
+ // set up the dirent pointer table and the cluster pointer table, run
+ // quickCheckForCorruptFile(), select the title index (the
+ // "listing/titleOrdered/v1" entry when available, the header's title index
+ // otherwise) and finally read the MIME type list.
+ // Throws ZimFileFormatError when the file cannot be opened or is malformed.
+ FileImpl::FileImpl(std::shared_ptr<FileCompound> _zimFile, offset_t offset, zsize_t size)
+   : zimFile(_zimFile),
+     archiveStartOffset(offset),
+     zimReader(makeFileReader(zimFile, offset, size)),
+     direntReader(new DirentReader(zimReader)),
+     clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)),
+     m_newNamespaceScheme(false),
+     m_hasFrontArticlesIndex(true),
+     m_startUserEntry(0),
+     m_endUserEntry(0)
+ {
+   log_trace("read file \"" << zimFile->filename() << '"');
+
+   // makeFileReader() returns a null reader for a failed file, so this check
+   // must come before any use of zimReader.
+   if (zimFile->fail())
+     throw ZimFileFormatError(std::string("can't open zim-file \"") + zimFile->filename() + '"');
+
+   // read header
+   if (size_type(zimReader->size()) < Fileheader::size) {
+     throw ZimFileFormatError("zim-file is too small to contain a header");
+   }
+   try {
+     header.read(*zimReader);
+   } catch (ZimFileFormatError&) {
+     // Rethrow the exception object itself: `throw e;` would throw a copy
+     // (and slice it if a derived type had been thrown).
+     throw;
+   } catch (...) {
+     // Any other failure while decoding the header is reported as a format error.
+     throw ZimFileFormatError("error reading zim-file header.");
+   }
+
+   auto urlPtrReader = sectionSubReader(*zimReader,
+                                        "Dirent pointer table",
+                                        offset_t(header.getUrlPtrPos()),
+                                        zsize_t(sizeof(offset_type)*header.getArticleCount()));
+
+   mp_urlDirentAccessor.reset(
+     new DirectDirentAccessor(direntReader, std::move(urlPtrReader), entry_index_t(header.getArticleCount())));
+
+
+   clusterOffsetReader = sectionSubReader(*zimReader,
+                                          "Cluster pointer table",
+                                          offset_t(header.getClusterPtrPos()),
+                                          zsize_t(sizeof(offset_type)*header.getClusterCount()));
+
+   quickCheckForCorruptFile();
+
+   mp_titleDirentAccessor = getTitleAccessor("listing/titleOrdered/v1");
+
+   if (!mp_titleDirentAccessor) {
+     // No usable front-article listing: fall back to the title index
+     // referenced by the header, which has one entry per article.
+     offset_t titleOffset(header.getTitleIdxPos());
+     zsize_t titleSize(sizeof(entry_index_type)*header.getArticleCount());
+     mp_titleDirentAccessor = getTitleAccessor(titleOffset, titleSize, "Title index table");
+     const_cast<bool&>(m_hasFrontArticlesIndex) = false;
+   }
+
+   readMimeTypes();
+ }
+
+ // Build a title accessor from the content of the entry `path` in namespace
+ // 'X'. Returns nullptr when the entry does not exist or when its cluster is
+ // compressed. Otherwise the accessor reads the blob in place, at the
+ // position given by the cluster offset plus the blob offset in the cluster.
+ std::unique_ptr<IndirectDirentAccessor> FileImpl::getTitleAccessor(const std::string& path)
+ {
+ auto result = direntLookup().find('X', path);
+ if (!result.first) {
+ return nullptr;
+ }
+
+ auto dirent = mp_urlDirentAccessor->getDirent(result.second);
+ auto cluster = getCluster(dirent->getClusterNumber());
+ if (cluster->isCompressed()) {
+ // This is a ZimFileFormatError.
+ // Let's be tolerant and skip the entry
+ return nullptr;
+ }
+ auto titleOffset = getClusterOffset(dirent->getClusterNumber()) + cluster->getBlobOffset(dirent->getBlobNumber());
+ auto titleSize = cluster->getBlobSize(dirent->getBlobNumber());
+ return getTitleAccessor(titleOffset, titleSize, "Title index table" + path);
+ }
+
+ // Build a title accessor over the table of entry_index_type values stored in
+ // the `size` bytes at `offset` of the archive. `name` is only used in the
+ // error raised by sectionSubReader() when the range is not readable. The
+ // number of titles is size / sizeof(entry_index_type).
+ std::unique_ptr<IndirectDirentAccessor> FileImpl::getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name)
+ {
+ auto titleIndexReader = sectionSubReader(*zimReader,
+ name,
+ offset,
+ size);
+
+ return std::unique_ptr<IndirectDirentAccessor>(
+ new IndirectDirentAccessor(mp_urlDirentAccessor, std::move(titleIndexReader), title_index_t(size.v/sizeof(entry_index_type))));
+ }
+
+ // Return the dirent lookup helper, creating it on first use. Creation is
+ // guarded by std::call_once; the cache size comes from envValue() with the
+ // key ZIM_DIRENTLOOKUPCACHE and the default DIRENT_LOOKUP_CACHE_SIZE.
+ FileImpl::DirentLookup& FileImpl::direntLookup() const
+ {
+ std::call_once(m_direntLookupOnceFlag, [this]{
+
+ const auto cacheSize = envValue("ZIM_DIRENTLOOKUPCACHE", DIRENT_LOOKUP_CACHE_SIZE);
+ m_direntLookup.reset(new DirentLookup(mp_urlDirentAccessor.get(), cacheSize));
+ });
+ return *m_direntLookup;
+ }
+
+ // Cheap consistency checks run right after the pointer tables are set up:
+ //  - the offset of the last cluster must not exceed the file size;
+ //  - when the header has a checksum, it must sit in the last 16 bytes of
+ //    the file.
+ // Throws ZimFileFormatError when a check fails; an archive without clusters
+ // only triggers a warning.
+ void FileImpl::quickCheckForCorruptFile()
+ {
+ if (!getCountClusters())
+ log_warn("no clusters found");
+ else
+ {
+ offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1));
+ log_debug("last offset=" << lastOffset.v << " file size=" << getFilesize().v);
+ if (lastOffset.v > getFilesize().v)
+ {
+ log_fatal("last offset (" << lastOffset << ") larger than file size (" << getFilesize() << ')');
+ throw ZimFileFormatError("last cluster offset larger than file size; file corrupt");
+ }
+ }
+
+ if (header.hasChecksum() && header.getChecksumPos() != (getFilesize().v-16) ) {
+ throw ZimFileFormatError("Checksum position is not valid");
+ }
+ }
+
+ // Compute an upper bound for the end of the MIME type list: the smallest
+ // offset among the structures that follow the header, i.e. the three
+ // positions stored in the header and, when the archive has entries, the
+ // first dirent and the first cluster.
+ offset_type FileImpl::getMimeListEndUpperLimit() const
+ {
+   offset_type limit = std::min(header.getUrlPtrPos(),
+                                std::min(header.getTitleIdxPos(), header.getClusterPtrPos()));
+   if ( getCountArticles().v == 0 ) {
+     return limit;
+   }
+
+   // assuming that dirents are placed in the zim file in the same
+   // order as the corresponding entries in the dirent pointer table
+   const offset_type firstDirentOffset = mp_urlDirentAccessor->getOffset(entry_index_t(0)).v;
+   limit = std::min(limit, firstDirentOffset);
+
+   // assuming that clusters are placed in the zim file in the same
+   // order as the corresponding entries in the cluster pointer table
+   const offset_type firstClusterOffset = readOffset(*clusterOffsetReader, 0).v;
+   limit = std::min(limit, firstClusterOffset);
+
+   return limit;
+ }
+
+ // Read the list of MIME types (consecutive NUL-terminated strings, ended by
+ // an empty string) into `mimeTypes`, then derive the namespace scheme and
+ // the range of user entries from the header's minor version.
+ // Throws ZimFileFormatError when the list has no room or is not properly
+ // terminated inside the bytes available to it.
+ void FileImpl::readMimeTypes()
+ {
+   // read mime types
+   // libzim write zims files two ways :
+   //  - The old way by putting the urlPtrPos just after the mimetype.
+   //  - The new way by putting the urlPtrPos at the end of the zim files.
+   //    In this case, the cluster data are always at 1024 bytes offset and we know that
+   //    mimetype list is before this.
+   // 1024 seems to be a good maximum size for the mimetype list, even for the "old" way.
+   const auto endMimeList = getMimeListEndUpperLimit();
+   if ( endMimeList <= header.getMimeListPos() ) {
+     throw(ZimFileFormatError("Bad ZIM archive"));
+   }
+   const zsize_t size(endMimeList - header.getMimeListPos());
+   if ( endMimeList > 1024 ) {
+     log_warn("The MIME-type list is abnormally large (" << size.v << " bytes)");
+   }
+   auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size);
+   const char* const bufferEnd = buffer.data() + size.v;
+   const char* p = buffer.data();
+   while (true) {
+     // `p` lands exactly on bufferEnd when the last string ends on the final
+     // byte of the buffer. The list terminator is then missing, and reading
+     // *p would access memory past the buffer, so report a format error.
+     if (p == bufferEnd) {
+       throw(ZimFileFormatError("Error getting mimelists."));
+     }
+     // An empty string marks the end of the list.
+     if (*p == '\0') {
+       break;
+     }
+
+     const char* zp = std::find(p, bufferEnd, '\0');
+
+     if (zp == bufferEnd) {
+       // Unterminated string.
+       throw(ZimFileFormatError("Error getting mimelists."));
+     }
+
+     std::string mimeType(p, zp);
+     mimeTypes.push_back(mimeType);
+
+     p = zp+1;
+   }
+
+   // Minor version >= 1 means the new namespace scheme, where user entries
+   // are those of namespace 'C'; otherwise every entry is a user entry.
+   const_cast<bool&>(m_newNamespaceScheme) = header.getMinorVersion() >= 1;
+   if (m_newNamespaceScheme) {
+     const_cast<entry_index_t&>(m_startUserEntry) = getNamespaceBeginOffset('C');
+     const_cast<entry_index_t&>(m_endUserEntry) = getNamespaceEndOffset('C');
+   } else {
+     const_cast<entry_index_t&>(m_endUserEntry) = getCountArticles();
+   }
+ }
+ // Look up the entry (`ns`, `url`) through the dirent lookup helper and return
+ // its result unchanged.
+ FileImpl::FindxResult FileImpl::findx(char ns, const std::string& url)
+ {
+ return direntLookup().find(ns, url);
+ }
+
+ // Look up an entry from a long path: the path is split into namespace and
+ // path by parseLongPath() and passed to findx(ns, path). Any exception
+ // raised along the way is swallowed and reported as "not found" at index 0.
+ FileImpl::FindxResult FileImpl::findx(const std::string& url)
+ {
+   try {
+     char ns;
+     std::string path;
+     std::tie(ns, path) = parseLongPath(url);
+     return findx(ns, path);
+   } catch (...) {
+     return { false, entry_index_t(0) };
+   }
+ }
+
+ // Three-way comparison of the key (`ns`, `title`) against `dirent`:
+ // namespaces are compared first (-1 / 1 when they differ), and the result of
+ // std::string::compare on the titles is returned when they are equal.
+ static inline int direntCompareTitle(char ns, const std::string& title, const Dirent& dirent)
+ {
+   auto direntNs = dirent.getNamespace();
+   if (ns != direntNs) {
+     return (ns < direntNs) ? -1 : 1;
+   }
+   return title.compare(dirent.getTitle());
+ }
+
+ // Binary search of (`ns`, `title`) in the title-ordered index.
+ // Returns { found, index }: on an exact match `index` is the position of a
+ // matching title; otherwise `found` is false and `index` is the position of
+ // the first entry ordered after the key. An empty index yields { false, 0 }.
+ FileImpl::FindxTitleResult FileImpl::findxByTitle(char ns, const std::string& title)
+ {
+   log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"');
+
+   entry_index_type l = 0;
+   entry_index_type u = entry_index_type(mp_titleDirentAccessor->getDirentCount());
+
+   if (l == u)
+   {
+     log_debug("namespace " << ns << " not found");
+     return { false, title_index_t(0) };
+   }
+
+   unsigned itcount = 0;
+   bool u_is_exact_match = false;
+   while (u - l > 1)
+   {
+     ++itcount;
+     entry_index_type p = l + (u - l) / 2;
+
+     auto d = getDirentByTitle(title_index_t(p));
+     int c = direntCompareTitle(ns, title, *d);
+
+     if (c <= 0) {
+       u = p;
+       u_is_exact_match = (c == 0);
+     } else {
+       l = p;
+     }
+   }
+
+   // We now have a range of 1 where:
+   //  - l lower than what we search for (may be upper or equal if it was since the beginning)
+   //  - u is upper or equal to what we search for.
+   //
+   // Let's check for l
+   auto d = getDirentByTitle(title_index_t(l));
+   int c = direntCompareTitle(ns, title, *d);
+
+   bool found;
+   entry_index_type ret_index;
+
+   if (c <= 0)
+   {
+     // If l is upper or equal, we have found a match (l), exact or not (c==0 ?)
+     found = (c==0);
+     ret_index = l;
+   } else {
+     // If l is lower, we have either a exact match (u if u is a exact match)
+     // or the upper bound of (virtual) searched range.
+     found = u_is_exact_match;
+     ret_index = u;
+   }
+
+   // `d` is a shared pointer, hence `->`; the conditional is parenthesized
+   // because `<<` binds tighter than `?:`. Without both, this statement does
+   // not compile once log_debug actually evaluates its argument.
+   log_debug("article (" << d->getTitle() << ") " << (found ? "" : "not ") << "found after " << itcount << " iterations in file \"" << getFilename() << "\"");
+   return { found, title_index_t(ret_index) };
+ }
+
+ // Return the file parts holding the range of `size` bytes at `offset`, as
+ // reported by FileCompound::locate().
+ FileCompound::PartRange
+ FileImpl::getFileParts(offset_t offset, zsize_t size)
+ {
+ return zimFile->locate(offset, size);
+ }
+
+ // Return the dirent at position `idx` of the dirent pointer table.
+ std::shared_ptr<const Dirent> FileImpl::getDirent(entry_index_t idx)
+ {
+ return mp_urlDirentAccessor->getDirent(idx);
+ }
+
+ // Return the dirent at position `idx` of the title-ordered index.
+ std::shared_ptr<const Dirent> FileImpl::getDirentByTitle(title_index_t idx)
+ {
+ return mp_titleDirentAccessor->getDirent(idx);
+ }
+
+ // Translate a position in the title-ordered index into an entry index.
+ entry_index_t FileImpl::getIndexByTitle(title_index_t idx) const
+ {
+ return mp_titleDirentAccessor->getDirectIndex(idx);
+ }
+
+ // Number of entries referenced by the title accessor selected at
+ // construction time.
+ entry_index_t FileImpl::getFrontEntryCount() const
+ {
+ return entry_index_t(mp_titleDirentAccessor->getDirentCount().v);
+ }
+
+ // Return the entry index found at position `idx` when user entries are
+ // sorted by (cluster number, entry index). The sorted list is built once, on
+ // first call, under std::call_once. Throws std::out_of_range when `idx` is
+ // not smaller than the number of listed entries.
+ entry_index_t FileImpl::getIndexByClusterOrder(entry_index_t idx) const
+ {
+ std::call_once(orderOnceFlag, [this]
+ {
+ articleListByCluster.reserve(getUserEntryCount().v);
+
+ auto endIdx = getEndUserEntry().v;
+ for(auto i = getStartUserEntry().v; i < endIdx; i++)
+ {
+ // This is the offset of the dirent in the zimFile
+ auto indexOffset = mp_urlDirentAccessor->getOffset(entry_index_t(i));
+ // Get the mimeType of the dirent (offset 0) to know the type of the dirent
+ uint16_t mimeType = zimReader->read_uint<uint16_t>(indexOffset);
+ // Redirect, link target and deleted dirents are all filed under cluster 0.
+ if (mimeType==Dirent::redirectMimeType || mimeType==Dirent::linktargetMimeType || mimeType == Dirent::deletedMimeType) {
+ articleListByCluster.push_back(std::make_pair(0, i));
+ } else {
+ // If it is a classic article, get the clusterNumber (at offset 8)
+ auto clusterNumber = zimReader->read_uint<zim::cluster_index_type>(indexOffset+offset_t(8));
+ articleListByCluster.push_back(std::make_pair(clusterNumber, i));
+ }
+ }
+ // Pairs sort by cluster number first, then by entry index.
+ std::sort(articleListByCluster.begin(), articleListByCluster.end());
+ });
+
+ if (idx.v >= articleListByCluster.size())
+ throw std::out_of_range("entry index out of range");
+ return entry_index_t(articleListByCluster[idx.v].second);
+ }
+
+ // Read cluster `idx` from zimReader, starting at the offset reported by
+ // getClusterOffset(). No caching happens here (see getCluster()).
+ FileImpl::ClusterHandle FileImpl::readCluster(cluster_index_t idx)
+ {
+   const offset_t clusterOffset = getClusterOffset(idx);
+   log_debug("read cluster " << idx << " from offset " << clusterOffset);
+   auto handle = Cluster::read(*zimReader, clusterOffset);
+   return handle;
+ }
+
+ // Return cluster `idx`, going through clusterCache (readCluster() is only
+ // invoked by the cache when needed).
+ // Throws ZimFileFormatError when `idx` is not below getCountClusters().
+ std::shared_ptr<const Cluster> FileImpl::getCluster(cluster_index_t idx)
+ {
+   if (idx >= getCountClusters())
+     throw ZimFileFormatError("cluster index out of range");
+
+   auto cluster = clusterCache.getOrPut(idx.v, [this, idx]() { return readCluster(idx); });
+#if ENV32BIT
+   // There was a bug in the way zim files were created with ZSTD compression:
+   // a too high compression level was used, which implies a 128Mb window.
+   // At decompression time zstd therefore reserves a 128Mb buffer.
+   // Although that memory is not really used (thanks to the lazy allocation
+   // done by the OS), it still consumes address space, which is a scarce
+   // resource on 32 bits when 128Mb are reserved at once.
+   // So the cluster is dropped from the cache to avoid future memory
+   // allocation errors.
+   //
+   // ZSTD compression started to be used with version 5.0 of the zim format.
+   // Shortly after, the format moved to 5.1 together with the fix in zstd
+   // creation. Testing for 5.0 is not a perfect way to detect faulty zim files
+   // (it gives false positives) but it should be enough.
+   const bool mayUseHugeWindow = cluster->getCompression() == Compression::Zstd
+                              && header.getMajorVersion() == 5
+                              && header.getMinorVersion() == 0;
+   if (mayUseHugeWindow) {
+     clusterCache.drop(idx.v);
+   }
+#endif
+   return cluster;
+ }
+
+ // Look up the file offset of cluster `idx` in clusterOffsetReader.
+ offset_t FileImpl::getClusterOffset(cluster_index_t idx) const
+ {
+   const Reader& offsets = *clusterOffsetReader;
+   return readOffset(offsets, idx.v);
+ }
+
+ // Offset of blob `blobIdx` of cluster `clusterIdx`: the cluster offset plus
+ // the blob offset reported by the cluster. For a compressed cluster
+ // offset_t(0) is returned instead.
+ offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx)
+ {
+   const auto cluster = getCluster(clusterIdx);
+   if (!cluster->isCompressed()) {
+     return getClusterOffset(clusterIdx) + cluster->getBlobOffset(blobIdx);
+   }
+   return offset_t(0);
+ }
+
+ // Start of the entry range of namespace `ch`, as computed by direntLookup().
+ entry_index_t FileImpl::getNamespaceBeginOffset(char ch) const
+ {
+   log_trace("getNamespaceBeginOffset(" << ch << ')');
+   auto& lookup = direntLookup();
+   return lookup.getNamespaceRangeBegin(ch);
+ }
+
+ // End of the entry range of namespace `ch`, as computed by direntLookup().
+ entry_index_t FileImpl::getNamespaceEndOffset(char ch) const
+ {
+   log_trace("getNamespaceEndOffset(" << ch << ')');
+   auto& lookup = direntLookup();
+   return lookup.getNamespaceRangeEnd(ch);
+ }
+
+ // Return the mimetype string registered under code `idx`.
+ // Throws ZimFileFormatError when `idx` is not an index into mimeTypes.
+ const std::string& FileImpl::getMimeType(uint16_t idx) const
+ {
+   if (idx < mimeTypes.size())
+     return mimeTypes[idx];
+
+   std::ostringstream msg;
+   msg << "unknown mime type code " << idx;
+   throw ZimFileFormatError(msg.str());
+ }
+
+ // Return the 16 checksum bytes stored in the file as a 32-character
+ // lowercase hexadecimal string.
+ // Returns an empty string when the header announces no checksum or when
+ // anything goes wrong while reading it.
+ std::string FileImpl::getChecksum()
+ {
+   if (!header.hasChecksum())
+     return std::string();
+
+   try {
+     const auto stored = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16));
+
+     static const char hex[] = "0123456789abcdef";
+     std::string hexdigest;
+     hexdigest.reserve(32);
+     for (int i = 0; i < 16; ++i)
+     {
+       const uint8_t byte = stored.at(offset_t(i));
+       hexdigest += hex[byte >> 4];    // high nibble first
+       hexdigest += hex[byte & 0xf];
+     }
+     log_debug("chksum=" << hexdigest);
+     return hexdigest;
+   } catch (...)
+   {
+     log_warn("error reading checksum");
+     return std::string();
+   }
+ }
+
+ // Recompute the MD5 of the file content located before the checksum
+ // position (walking through every part of zimFile in order) and compare it
+ // with the 16 bytes stored at that position.
+ // Returns false when the header has no checksum, when a part cannot be read
+ // up to the checksum position, or when the digests differ.
+ bool FileImpl::verify()
+ {
+   if (!header.hasChecksum())
+     return false;
+
+   struct zim_MD5_CTX md5ctx;
+   zim_MD5Init(&md5ctx);
+
+   const offset_type checksumPos = header.getChecksumPos();
+   offset_type hashedBytes = 0;
+
+   auto part = zimFile->begin();
+   while (part != zimFile->end()) {
+     std::ifstream stream(part->second->filename(), std::ios_base::in|std::ios_base::binary);
+
+     // Feed the digest one byte at a time until the checksum position is
+     // reached or this part is exhausted.
+     char ch;
+     while (hashedBytes < checksumPos && stream.get(ch).good()) {
+       zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&ch), 1);
+       hashedBytes++;
+     }
+     if (stream.bad()) {
+       perror("error while reading file");
+       return false;
+     }
+     if (hashedBytes == checksumPos) {
+       break;
+     }
+     part++;
+   }
+
+   // All parts consumed without reaching the checksum position.
+   if (hashedBytes != checksumPos) {
+     return false;
+   }
+
+   unsigned char computed[16];
+   auto stored = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16));
+
+   zim_MD5Final(computed, &md5ctx);
+   return std::memcmp(stored.data(), computed, 16) == 0;
+ }
+
+ // Modification time as reported by zimFile.
+ time_t FileImpl::getMTime() const {
+   const time_t mtime = zimFile->getMTime();
+   return mtime;
+ }
+
+ // Size reported by zimReader.
+ zim::zsize_t FileImpl::getFilesize() const {
+   const zim::zsize_t total = zimReader->size();
+   return total;
+ }
+
+ // Forwarded to zimFile->is_multiPart().
+ bool FileImpl::is_multiPart() const {
+   const bool multiPart = zimFile->is_multiPart();
+   return multiPart;
+ }
+
+ // Run the integrity check selected by `checkType` and return its result.
+ // IntegrityCheck::COUNT is not a real check: it triggers the ASSERT and, if
+ // execution continues, false is returned.
+ bool FileImpl::checkIntegrity(IntegrityCheck checkType) {
+   switch (checkType) {
+     case IntegrityCheck::CHECKSUM:
+       return checkChecksum();
+     case IntegrityCheck::DIRENT_PTRS:
+       return checkDirentPtrs();
+     case IntegrityCheck::DIRENT_ORDER:
+       return checkDirentOrder();
+     case IntegrityCheck::TITLE_INDEX:
+       return checkTitleIndex();
+     case IntegrityCheck::CLUSTER_PTRS:
+       return checkClusterPtrs();
+     case IntegrityCheck::DIRENT_MIMETYPES:
+       return checkDirentMimeTypes();
+     case IntegrityCheck::COUNT:
+       ASSERT("shouldn't have reached here", ==, "");
+   }
+   return false;
+ }
+
+ // Wrapper around verify() that reports a mismatch on std::cerr.
+ bool FileImpl::checkChecksum() {
+   const bool matches = verify();
+   if (!matches) {
+     std::cerr << "Checksum doesn't match" << std::endl;
+   }
+   return matches;
+ }
+
+ // Check that every dirent offset lies in the accepted range: not below 80
+ // and leaving room for at least 11 bytes before the checksum position (or
+ // the end of the file when there is no checksum).
+ // Reports on std::cerr and returns false at the first offending pointer.
+ bool FileImpl::checkDirentPtrs() {
+   const offset_t lowestValidOffset(80); // XXX: really???
+   const offset_t validRangeEnd = header.hasChecksum()
+                                ? offset_t(header.getChecksumPos())
+                                : offset_t(zimReader->size().v);
+   const zsize_t smallestDirent(11);
+
+   const entry_index_type total = getCountArticles().v;
+   for (entry_index_type n = 0; n < total; ++n)
+   {
+     const auto direntOffset = mp_urlDirentAccessor->getOffset(entry_index_t(n));
+     const bool startsTooEarly = direntOffset < lowestValidOffset;
+     if (startsTooEarly || direntOffset + smallestDirent > validRangeEnd) {
+       std::cerr << "Invalid dirent pointer" << std::endl;
+       return false;
+     }
+   }
+   return true;
+ }
+
+ // Check that the dirents served by mp_urlDirentAccessor are strictly
+ // increasing with respect to getLongUrl().
+ // Prints the first offending pair on std::cerr and returns false.
+ bool FileImpl::checkDirentOrder() {
+   std::shared_ptr<const Dirent> previous;
+   const entry_index_type total = getCountArticles().v;
+   for (entry_index_type n = 0; n < total; ++n)
+   {
+     const std::shared_ptr<const Dirent> current = mp_urlDirentAccessor->getDirent(entry_index_t(n));
+     if (previous)
+     {
+       const bool strictlyIncreasing = previous->getLongUrl() < current->getLongUrl();
+       if (!strictlyIncreasing)
+       {
+         std::cerr << "Dirent table is not properly sorted:\n"
+                   << " #" << n-1 << ": " << previous->getLongUrl() << "\n"
+                   << " #" << n << ": " << current->getLongUrl() << std::endl;
+         return false;
+       }
+     }
+     previous = current;
+   }
+   return true;
+ }
+
+ // Check that every cluster offset lies in the accepted range: not below 80
+ // and leaving room for at least 1 byte before the checksum position (or the
+ // end of the file when there is no checksum).
+ // Reports on std::cerr and returns false at the first offending pointer.
+ bool FileImpl::checkClusterPtrs() {
+   const offset_t lowestValidOffset(80); // XXX: really???
+   const offset_t validRangeEnd = header.hasChecksum()
+                                ? offset_t(header.getChecksumPos())
+                                : offset_t(zimReader->size().v);
+   const zsize_t smallestCluster(1); // XXX
+
+   const cluster_index_type total = getCountClusters().v;
+   for (cluster_index_type n = 0; n < total; ++n)
+   {
+     const auto clusterOffset = readOffset(*clusterOffsetReader, n);
+     const bool startsTooEarly = clusterOffset < lowestValidOffset;
+     if (startsTooEarly || clusterOffset + smallestCluster > validRangeEnd) {
+       std::cerr << "Invalid cluster pointer" << std::endl;
+       return false;
+     }
+   }
+   return true;
+ }
+
+namespace
+{
+
+// Sort key of a dirent in a title listing: "<namespace>/<title>".
+std::string pseudoTitle(const Dirent& d)
+{
+  std::string key(1, d.getNamespace());
+  key += '/';
+  key += d.getTitle();
+  return key;
+}
+
+// Validate one title listing:
+//  - every entry index it stores must be below `totalCount`;
+//  - the dirents must be in non-decreasing pseudoTitle() order.
+// Reports the first problem on std::cerr and returns false; true otherwise.
+bool checkTitleListing(const IndirectDirentAccessor& accessor, entry_index_type totalCount) {
+  std::shared_ptr<const Dirent> previous;
+  const entry_index_type listed = accessor.getDirentCount().v;
+  for (entry_index_type pos = 0; pos < listed; ++pos) {
+    const title_index_t titleIdx(pos);
+    if (accessor.getDirectIndex(titleIdx).v >= totalCount) {
+      std::cerr << "Invalid title index entry." << std::endl;
+      return false;
+    }
+
+    const std::shared_ptr<const Dirent> current = accessor.getDirent(titleIdx);
+    if (previous) {
+      const bool inOrder = pseudoTitle(*previous) <= pseudoTitle(*current);
+      if (!inOrder) {
+        std::cerr << "Title index is not properly sorted." << std::endl;
+        return false;
+      }
+    }
+    previous = current;
+  }
+  return true;
+}
+
+} // unnamed namespace
+
+ // Validate the full title index located through the header and, when
+ // getTitleAccessor() finds it, the "listing/titleOrdered/v1" listing too.
+ // Both listings are checked even if the first one already failed; the
+ // result is true only when every checked listing is valid.
+ bool FileImpl::checkTitleIndex() {
+   const entry_index_type totalEntries = getCountArticles().v;
+
+   const offset_t fullIndexPos(header.getTitleIdxPos());
+   const zsize_t fullIndexSize(sizeof(entry_index_type)*header.getArticleCount());
+   auto accessor = getTitleAccessor(fullIndexPos, fullIndexSize, "Full Title index table");
+   bool allValid = checkTitleListing(*accessor, totalEntries);
+
+   accessor = getTitleAccessor("listing/titleOrdered/v1");
+   if (accessor) {
+     const bool listingValid = checkTitleListing(*accessor, totalEntries);
+     allValid = allValid && listingValid;
+   }
+   return allValid;
+ }
+
+ // Check that every dirent for which isArticle() holds carries a mimetype
+ // code that indexes into mimeTypes.
+ // Reports the first offending entry on std::cerr and returns false.
+ bool FileImpl::checkDirentMimeTypes() {
+   const entry_index_type total = getCountArticles().v;
+   for (entry_index_type n = 0; n < total; ++n)
+   {
+     const auto entry = mp_urlDirentAccessor->getDirent(entry_index_t(n));
+     if (!entry->isArticle())
+       continue;
+     if (entry->getMimeType() >= mimeTypes.size()) {
+       std::cerr << "Entry " << entry->getLongUrl()
+                 << " has invalid MIME-type value " << entry->getMimeType()
+                 << "." << std::endl;
+       return false;
+     }
+   }
+   return true;
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020-2021 Veloman Yunkan
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILEIMPL_H
+#define ZIM_FILEIMPL_H
+
+#include <string>
+#include <vector>
+#include <map>
+#include <memory>
+#include <zim/zim.h>
+#include <mutex>
+#include "lrucache.h"
+#include "concurrent_cache.h"
+#include "_dirent.h"
+#include "dirent_accessor.h"
+#include "dirent_lookup.h"
+#include "cluster.h"
+#include "buffer.h"
+#include "file_reader.h"
+#include "file_compound.h"
+#include "fileheader.h"
+#include "zim_types.h"
+#include "direntreader.h"
+
+
+namespace zim
+{
+ class FileImpl // Implementation object giving read access to one zim archive: header, dirents, clusters and integrity checks.
+ {
+ std::shared_ptr<FileCompound> zimFile; // underlying file compound; source of filename(), getMTime() and of the parts walked by verify()
+ offset_t archiveStartOffset; // value returned by getArchiveStartOffset()
+ std::shared_ptr<Reader> zimReader; // reader used for dirent fields, clusters and the stored checksum
+ std::shared_ptr<DirentReader> direntReader;
+ Fileheader header; // parsed file header (counts, positions, version, checksum info)
+
+ std::unique_ptr<const Reader> clusterOffsetReader; // reader over the cluster offset table (see getClusterOffset())
+
+ std::shared_ptr<const DirectDirentAccessor> mp_urlDirentAccessor; // dirent access by entry index (getDirent())
+ std::unique_ptr<const IndirectDirentAccessor> mp_titleDirentAccessor; // dirent access by title index (getDirentByTitle(), getIndexByTitle())
+
+ typedef std::shared_ptr<const Cluster> ClusterHandle;
+ ConcurrentCache<cluster_index_type, ClusterHandle> clusterCache; // clusters already read, keyed by cluster index (see getCluster())
+
+ const bool m_newNamespaceScheme; // exposed by hasNewNamespaceScheme()
+ const bool m_hasFrontArticlesIndex; // exposed by hasFrontArticlesIndex()
+ const entry_index_t m_startUserEntry; // first user entry (inclusive bound used by getIndexByClusterOrder())
+ const entry_index_t m_endUserEntry; // end of user entries (exclusive bound used by getIndexByClusterOrder())
+
+ typedef std::vector<std::string> MimeTypes;
+ MimeTypes mimeTypes; // mimetype strings indexed by mimetype code (see getMimeType())
+
+ using pair_type = std::pair<cluster_index_type, entry_index_type>; // (cluster number, entry index)
+ mutable std::vector<pair_type> articleListByCluster; // built lazily and sorted by getIndexByClusterOrder()
+ mutable std::once_flag orderOnceFlag; // guards the one-time construction of articleListByCluster
+
+ using DirentLookup = zim::DirentLookup<DirectDirentAccessor>;
+ mutable std::unique_ptr<DirentLookup> m_direntLookup; // presumably created lazily by direntLookup() - TODO confirm in fileimpl.cpp
+ mutable std::once_flag m_direntLookupOnceFlag; // presumably guards the creation of m_direntLookup - TODO confirm
+
+ public:
+ using FindxResult = std::pair<bool, entry_index_t>; // (exact match found?, entry index)
+ using FindxTitleResult = std::pair<bool, title_index_t>; // (exact match found?, title index)
+
+ explicit FileImpl(const std::string& fname);
+#ifndef _WIN32
+ explicit FileImpl(int fd);
+ FileImpl(int fd, offset_t offset, zsize_t size);
+#endif
+
+ offset_t getArchiveStartOffset() const { return archiveStartOffset; }
+ time_t getMTime() const;
+
+ const std::string& getFilename() const { return zimFile->filename(); }
+ const Fileheader& getFileheader() const { return header; }
+ zsize_t getFilesize() const;
+ bool hasNewNamespaceScheme() const { return m_newNamespaceScheme; }
+ bool hasFrontArticlesIndex() const { return m_hasFrontArticlesIndex; }
+
+ FileCompound::PartRange getFileParts(offset_t offset, zsize_t size);
+ std::shared_ptr<const Dirent> getDirent(entry_index_t idx);
+ std::shared_ptr<const Dirent> getDirentByTitle(title_index_t idx);
+ entry_index_t getIndexByTitle(title_index_t idx) const;
+ entry_index_t getIndexByClusterOrder(entry_index_t idx) const; // throws std::out_of_range on an invalid idx
+ entry_index_t getCountArticles() const { return entry_index_t(header.getArticleCount()); }
+
+ FindxResult findx(char ns, const std::string& url);
+ FindxResult findx(const std::string& url);
+ FindxTitleResult findxByTitle(char ns, const std::string& title);
+
+ std::shared_ptr<const Cluster> getCluster(cluster_index_t idx); // throws ZimFileFormatError on an out-of-range idx
+ cluster_index_t getCountClusters() const { return cluster_index_t(header.getClusterCount()); }
+ offset_t getClusterOffset(cluster_index_t idx) const;
+ offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx); // offset_t(0) for compressed clusters
+
+ entry_index_t getNamespaceBeginOffset(char ch) const;
+ entry_index_t getNamespaceEndOffset(char ch) const;
+ entry_index_t getNamespaceEntryCount(char ch) const {
+ return getNamespaceEndOffset(ch) - getNamespaceBeginOffset(ch);
+ }
+
+ entry_index_t getStartUserEntry() const { return m_startUserEntry; }
+ entry_index_t getEndUserEntry() const { return m_endUserEntry; }
+ // The number of entries added by the creator (so excluding index, ...).
+ // With the new namespace scheme, this is the number of entries in the C namespace.
+ entry_index_t getUserEntryCount() const { return m_endUserEntry - m_startUserEntry; }
+ // The number of entries that can be considered as front articles (no resource).
+ entry_index_t getFrontEntryCount() const;
+
+ const std::string& getMimeType(uint16_t idx) const; // throws ZimFileFormatError on an unknown code
+
+ std::string getChecksum(); // hex string of the stored checksum, empty when absent or unreadable
+ bool verify(); // recomputes the MD5 of the content and compares it with the stored checksum
+ bool is_multiPart() const;
+
+ bool checkIntegrity(IntegrityCheck checkType); // dispatches to one of the private check*() methods below
+ private:
+ explicit FileImpl(std::shared_ptr<FileCompound> zimFile);
+ FileImpl(std::shared_ptr<FileCompound> zimFile, offset_t offset, zsize_t size);
+
+ std::unique_ptr<IndirectDirentAccessor> getTitleAccessor(const std::string& path); // may return null (checkTitleIndex() tests the result)
+ std::unique_ptr<IndirectDirentAccessor> getTitleAccessor(const offset_t offset, const zsize_t size, const std::string& name);
+
+ DirentLookup& direntLookup() const;
+ ClusterHandle readCluster(cluster_index_t idx); // uncached read; getCluster() is the cached entry point
+ offset_type getMimeListEndUpperLimit() const;
+ void readMimeTypes();
+ void quickCheckForCorruptFile();
+
+ bool checkChecksum();
+ bool checkDirentPtrs();
+ bool checkDirentOrder();
+ bool checkTitleIndex();
+ bool checkClusterPtrs();
+ bool checkDirentMimeTypes();
+ };
+
+}
+
+#endif // ZIM_FILEIMPL_H
+
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_H_
+#define ZIM_FS_H_
+
+#ifdef _WIN32
+# include "fs_windows.h"
+#else
+# include "fs_unix.h"
+#endif
+
+namespace zim {
+
+// DEFAULTFS names the filesystem backend matching the build platform:
+// windows::FS when _WIN32 is defined, unix::FS otherwise.
+#ifndef _WIN32
+typedef unix::FS DEFAULTFS;
+#else
+typedef windows::FS DEFAULTFS;
+#endif
+
+} // namespace zim
+
+#endif //ZIM_FS_H_
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fs_unix.h"
+#include <stdexcept>
+#include <vector>
+#include <sstream>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <errno.h>
+
+namespace zim
+{
+
+namespace unix {
+
+// Read exactly `size` bytes starting at file position `offset` into `dest`
+// (which must be able to hold `size` bytes). The descriptor's own file
+// position is not used (positional read).
+//
+// Returns `size` on success and zsize_t(-1) when the read fails or when the
+// end of the file is reached before `size` bytes could be read.
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+#if defined(__APPLE__) || defined(__OpenBSD__) || defined(__FreeBSD__)
+# define PREAD pread
+#else
+# define PREAD pread64
+#endif
+  ssize_t full_size_read = 0;
+  auto size_to_read = size.v;
+  auto current_offset = offset.v;
+  errno = 0;
+  while (size_to_read > 0) {
+    auto size_read = PREAD(m_fd, dest, size_to_read, current_offset);
+    if (size_read == -1) {
+      return zsize_t(-1);
+    }
+    if (size_read == 0) {
+      // End of file: nothing more will ever come. Report it as an error
+      // instead of spinning forever on a request that cannot be fulfilled.
+      return zsize_t(-1);
+    }
+    // A short read is legal: continue right after the bytes already stored,
+    // both in the file and in the destination buffer.
+    dest += size_read;
+    size_to_read -= size_read;
+    current_offset += size_read;
+    full_size_read += size_read;
+  }
+  return zsize_t(full_size_read);
+#undef PREAD
+}
+
+// Size in bytes of the file behind the descriptor, as reported by fstat().
+// Returns zsize_t(0) when fstat() fails (e.g. invalid descriptor), which is
+// also what the Windows backend reports on failure.
+zsize_t FD::getSize() const
+{
+  struct stat sb;
+  if (fstat(m_fd, &sb) != 0) {
+    // `sb` is left unspecified on failure: never read it.
+    return zsize_t(0);
+  }
+  return zsize_t(sb.st_size);
+}
+
+// Move the descriptor's file position to the absolute position `offset`.
+// Returns true when lseek() reports exactly that position.
+bool FD::seek(offset_t offset)
+{
+  const auto reached = lseek(m_fd, offset.v, SEEK_SET);
+  const auto wanted = static_cast<int64_t>(offset.v);
+  return wanted == reached;
+}
+
+// Close the descriptor, if any.
+// Returns true when a descriptor was held and ::close() succeeded, false
+// otherwise (nothing to close, or ::close() reported an error). This matches
+// the "true means success" convention of the Windows backend.
+bool FD::close() {
+  if (m_fd == -1) {
+    return false;
+  }
+  const int rc = ::close(m_fd);
+  // Forget the descriptor whatever the outcome: closing the same number again
+  // (e.g. from the destructor) could hit a descriptor reused by someone else.
+  m_fd = -1;
+  return rc == 0;
+}
+
+// Open `filepath` read-only and return the descriptor wrapped in an FD.
+// Throws std::runtime_error (message naming the path and errno) on failure.
+FD FS::openFile(path_t filepath)
+{
+  int fd = open(filepath.c_str(), O_RDONLY);
+  if (fd == -1) {
+    std::ostringstream oss;
+    oss << "Cannot open file " << filepath << " : " << errno;
+    throw std::runtime_error(oss.str());
+  }
+  return FD(fd);
+}
+
+// Create directory `path` with mode rwxrwxr-x (subject to the umask).
+// Returns true when mkdir() succeeds.
+bool FS::makeDirectory(path_t path)
+{
+  const mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH;
+  const int rc = mkdir(path.c_str(), mode);
+  return rc == 0;
+}
+
+// Rename `old_path` to `new_path`.
+// Throws std::runtime_error when the rename fails, like the Windows backend
+// does, instead of silently leaving the file under its old name.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  if (::rename(old_path.c_str(), new_path.c_str()) != 0) {
+    std::ostringstream oss;
+    oss << "Cannot move file " << old_path << " to " << new_path
+        << " : " << errno;
+    throw std::runtime_error(oss.str());
+  }
+}
+
+// Concatenate `base` and `name` with a '/' in between.
+std::string FS::join(path_t base, path_t name)
+{
+  std::string joined(base);
+  joined += "/";
+  joined += name;
+  return joined;
+}
+
+// Remove `path`. If it can be opened as a directory, every child except "."
+// and ".." is removed recursively first (their results are ignored) and the
+// result of removeDir() is returned; otherwise the result of removeFile().
+bool FS::remove(path_t path)
+{
+  DIR* dir = opendir(path.c_str());
+  if (dir == NULL) {
+    /* Not a directory we can open: handle it as a file */
+    return removeFile(path);
+  }
+
+  /* It's a directory, remove all its entries first */
+  for (struct dirent* ent = readdir(dir); ent != NULL; ent = readdir(dir)) {
+    const std::string childName = ent->d_name;
+    if (childName == "." || childName == "..") {
+      continue;
+    }
+    const auto childPath = join(path, childName);
+    remove(childPath);
+  }
+  closedir(dir);
+  return removeDir(path);
+}
+
+// Remove the (empty) directory `path`.
+// Returns true on success, consistently with makeDirectory() and with the
+// Windows backend. rmdir() returns 0 on success, so its raw value must not be
+// converted to bool directly.
+bool FS::removeDir(path_t path) {
+  return rmdir(path.c_str()) == 0;
+}
+
+// Remove the file `path`.
+// Returns true on success, consistently with makeDirectory() and with the
+// Windows backend. ::remove() returns 0 on success, so its raw value must not
+// be converted to bool directly.
+bool FS::removeFile(path_t path) {
+  return ::remove(path.c_str()) == 0;
+}
+
+
+}; // unix namespace
+
+// Build the "/dev/fd/<fd>" path designating the open descriptor `fd`.
+std::string getFilePathFromFD(int fd)
+{
+  std::ostringstream path;
+  path << "/dev/fd/";
+  path << fd;
+  return path.str();
+}
+
+}; // zim namespace
+
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_UNIX_H_
+#define ZIM_FS_UNIX_H_
+
+#include "zim_types.h"
+
+#include <stdexcept>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+namespace zim {
+
+namespace unix {
+
+using path_t = const std::string&;
+
+// Move-only owner of a POSIX file descriptor. The descriptor (if any) is
+// closed when the object is destroyed; -1 means "no descriptor".
+class FD {
+  public:
+    using fd_t = int;
+
+  private:
+    fd_t m_fd = -1;
+
+  public:
+    FD() = default;
+    // Take ownership of the already opened descriptor `fd`.
+    FD(fd_t fd):
+      m_fd(fd) {}
+    FD(const FD& o) = delete;
+    // Steal the descriptor of `o`, which is left empty.
+    FD(FD&& o) :
+      m_fd(o.m_fd) { o.m_fd = -1; }
+    // Steal the descriptor of `o`. The descriptor currently held (if any) is
+    // closed first so that it does not leak; self-assignment is a no-op.
+    FD& operator=(FD&& o) {
+      if (this != &o) {
+        close();
+        m_fd = o.m_fd;
+        o.m_fd = -1;
+      }
+      return *this;
+    }
+    ~FD() { close(); }
+    // Positional read of `size` bytes at `offset` into `dest`.
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    zsize_t getSize() const;
+    // Raw descriptor, still owned by this object.
+    fd_t getNativeHandle() const
+    {
+      return m_fd;
+    }
+    // Give up ownership: return the descriptor and leave this object empty.
+    fd_t release()
+    {
+      int ret = m_fd;
+      m_fd = -1;
+      return ret;
+    }
+    bool seek(offset_t offset);
+    bool close();
+};
+
+// Collection of static filesystem helpers for POSIX platforms.
+struct FS {
+  typedef zim::unix::FD FD;
+
+  // Path manipulation
+  static std::string join(path_t base, path_t name);
+
+  // Opening files (throws std::runtime_error on failure)
+  static FD openFile(path_t filepath);
+
+  // Creating / moving
+  static bool makeDirectory(path_t path);
+  static void rename(path_t old_path, path_t new_path);
+
+  // Removing: remove() handles both files and (recursively) directories
+  static bool remove(path_t path);
+  static bool removeDir(path_t path);
+  static bool removeFile(path_t path);
+};
+
+}; // unix namespace
+
+std::string getFilePathFromFD(int fd);
+
+}; // zim namespace
+
+#endif //ZIM_FS_UNIX_H_
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "fs_windows.h"
+#include <stdexcept>
+
+#include <windows.h>
+#include <winbase.h>
+#include <synchapi.h>
+#include <io.h>
+#include <fileapi.h>
+
+#include <iostream>
+#include <sstream>
+
+namespace zim {
+
+namespace windows {
+
+// Private state of windows::FD: the file handle plus the critical section
+// that serializes the seek+read pair done by FD::readAt().
+// The destructor only releases the critical section; the handle itself is
+// closed by FD.
+struct ImplFD {
+  HANDLE m_handle = INVALID_HANDLE_VALUE;
+  CRITICAL_SECTION m_criticalSection;
+
+  ImplFD() :
+    ImplFD(INVALID_HANDLE_VALUE) {}
+
+  ImplFD(HANDLE handle) :
+    m_handle(handle)
+  {
+    InitializeCriticalSection(&m_criticalSection);
+  }
+
+  ~ImplFD() {
+    DeleteCriticalSection(&m_criticalSection);
+  }
+};
+
+// Empty FD: owns an ImplFD holding INVALID_HANDLE_VALUE.
+FD::FD() :
+  mp_impl(new ImplFD()) {}
+
+// Take ownership of the already opened `handle`.
+FD::FD(fd_t handle) :
+  mp_impl(new ImplFD(handle)) {}
+
+// Moving transfers the ImplFD; the moved-from FD is left without one.
+FD::FD(FD&& o) = default;
+
+// Move assignment. The handle currently owned (if any) is closed before the
+// state of `o` is taken over: ImplFD's destructor does not close the handle,
+// so a defaulted assignment would leak it. Self-assignment is a no-op.
+FD& FD::operator=(FD&& o)
+{
+  if (this != &o) {
+    if (mp_impl)
+      close();
+    mp_impl = std::move(o.mp_impl);
+  }
+  return *this;
+}
+
+// Close the handle on destruction, unless this FD was moved from (in which
+// case it no longer has an ImplFD).
+FD::~FD()
+{
+  if (!mp_impl)
+    return;
+  close();
+}
+
+// Read exactly `size` bytes starting at file position `offset` into `dest`
+// (which must be able to hold `size` bytes).
+//
+// The handle's file pointer is moved, so the whole seek+read sequence runs
+// inside the critical section of the ImplFD. ReadFile() takes a 32-bit
+// length and may return fewer bytes than asked, so the request is served
+// with as many calls as needed.
+//
+// Returns `size` on success and zsize_t(-1) on any failure, including
+// reaching the end of the file before `size` bytes were read.
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+  if (!mp_impl)
+    return zsize_t(-1);
+
+  bool ok = false;
+  EnterCriticalSection(&mp_impl->m_criticalSection);
+  LARGE_INTEGER off;
+  off.QuadPart = offset.v;
+  if (SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) {
+    ok = true;
+    auto size_to_read = size.v;
+    while (size_to_read > 0) {
+      // Never ask for more than a DWORD can express.
+      const DWORD chunk = size_to_read > MAXDWORD
+                        ? MAXDWORD
+                        : static_cast<DWORD>(size_to_read);
+      DWORD size_read = 0;
+      if (!ReadFile(mp_impl->m_handle, dest, chunk, &size_read, NULL)
+          || size_read == 0) {
+        // I/O error, or end of file before everything was read.
+        ok = false;
+        break;
+      }
+      // ReadFile() advanced the file pointer; advance the buffer as well.
+      dest += size_read;
+      size_to_read -= size_read;
+    }
+  }
+  LeaveCriticalSection(&mp_impl->m_criticalSection);
+  return ok ? size : zsize_t(-1);
+}
+
+// Move the handle's file pointer to the absolute position `offset`.
+// Returns false when this FD has no ImplFD or when SetFilePointerEx() fails.
+bool FD::seek(offset_t offset)
+{
+  if (mp_impl) {
+    LARGE_INTEGER target;
+    target.QuadPart = offset.v;
+    return SetFilePointerEx(mp_impl->m_handle, target, NULL, FILE_BEGIN);
+  }
+  return false;
+}
+
+// Size in bytes of the file behind the handle.
+// Returns zsize_t(0) when this FD has no ImplFD or when GetFileSizeEx() fails.
+zsize_t FD::getSize() const
+{
+  if (!mp_impl)
+    return zsize_t(0);
+
+  LARGE_INTEGER fileSize;
+  const bool known = GetFileSizeEx(mp_impl->m_handle, &fileSize) != 0;
+  if (!known) {
+    fileSize.QuadPart = 0;
+  }
+  return zsize_t(fileSize.QuadPart);
+}
+
+// Give up ownership of the handle: it is wrapped into a C runtime file
+// descriptor with _open_osfhandle() and this FD forgets it.
+// Returns that descriptor, or -1 when this FD has no ImplFD.
+int FD::release()
+{
+  if (!mp_impl)
+    return -1;
+
+  const auto osHandle = reinterpret_cast<intptr_t>(mp_impl->m_handle);
+  const int crtFd = _open_osfhandle(osHandle, 0);
+  mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  return crtFd;
+}
+
+// Close the handle, if any.
+// Returns true when a handle was held and CloseHandle() succeeded.
+bool FD::close()
+{
+  if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) {
+    return false;
+  }
+  const bool closed = CloseHandle(mp_impl->m_handle) != 0;
+  // Forget the handle so that a later close() (e.g. from the destructor
+  // after an explicit close()) does not close it a second time.
+  mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  return closed;
+}
+
+// Convert the UTF-8 encoded `path` to a null-terminated wide-character
+// string. A first MultiByteToWideChar() call computes the required length,
+// a second one performs the conversion.
+// Throws std::runtime_error (message carrying GetLastError()) when the
+// conversion fails.
+std::unique_ptr<wchar_t[]> FS::toWideChar(path_t path)
+{
+  const char* const utf8 = path.c_str();
+  const auto needed = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, nullptr, 0);
+  std::unique_ptr<wchar_t[]> wide(new wchar_t[needed]);
+  const auto written = MultiByteToWideChar(CP_UTF8, 0, utf8, -1, wide.get(), needed);
+  if (written == 0) {
+    std::ostringstream oss;
+    oss << "Cannot convert path to wchar : " << GetLastError();
+    throw std::runtime_error(oss.str());
+  }
+  return wide;
+}
+
+// Open the existing file `filepath` for reading (shared for reading, random
+// access hint) and return the handle wrapped in an FD.
+// Throws std::runtime_error (message carrying GetLastError()) on failure.
+FD FS::openFile(path_t filepath)
+{
+  const auto wpath = toWideChar(filepath);
+  const FD::fd_t handle = CreateFileW(
+      wpath.get(),
+      GENERIC_READ,
+      FILE_SHARE_READ,
+      NULL,
+      OPEN_EXISTING,
+      FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS,
+      NULL);
+  if (handle != INVALID_HANDLE_VALUE) {
+    return FD(handle);
+  }
+
+  std::ostringstream oss;
+  oss << "Cannot open file : " << GetLastError();
+  throw std::runtime_error(oss.str());
+}
+
+// Create directory `path` with default security attributes.
+// Returns true when CreateDirectoryW() succeeds.
+bool FS::makeDirectory(path_t path)
+{
+  const auto wpath = toWideChar(path);
+  return CreateDirectoryW(wpath.get(), NULL);
+}
+
+
+// Move `old_path` to `new_path`, replacing an existing destination and
+// waiting for the move to be flushed to disk.
+// Throws std::runtime_error when the move fails.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  const DWORD flags = MOVEFILE_REPLACE_EXISTING|MOVEFILE_WRITE_THROUGH;
+  if (MoveFileExW(toWideChar(old_path).get(), toWideChar(new_path).get(), flags)) {
+    return;
+  }
+
+  std::ostringstream oss;
+  oss << "Cannot move file " << old_path << " to " << new_path;
+  throw std::runtime_error(oss.str());
+}
+
+// Concatenate `base` and `name` with a backslash in between.
+std::string FS::join(path_t base, path_t name)
+{
+  std::string joined(base);
+  joined += "\\";
+  joined += name;
+  return joined;
+}
+
+// Remove directory `path`; true when RemoveDirectoryW() succeeds.
+bool FS::removeDir(path_t path)
+{
+  const auto wpath = toWideChar(path);
+  return RemoveDirectoryW(wpath.get());
+}
+
+// Remove file `path`; true when DeleteFileW() succeeds.
+bool FS::removeFile(path_t path)
+{
+  const auto wpath = toWideChar(path);
+  return DeleteFileW(wpath.get());
+}
+
+}; // windows namespace
+
+}; // zim namespace
+
--- /dev/null
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_WINDOWS_H_
+#define ZIM_FS_WINDOWS_H_
+
+#include "zim_types.h"
+
+#include <stdexcept>
+#include <memory>
+
+typedef void* HANDLE;
+
+namespace zim {
+
+namespace windows {
+
+using path_t = const std::string&;
+
+struct ImplFD;
+
+// Move-only owner of a Windows file HANDLE. The state lives in an ImplFD
+// (defined in fs_windows.cpp) so that this header does not need <windows.h>.
+class FD {
+  public:
+    using fd_t = HANDLE;
+
+  private:
+    std::unique_ptr<ImplFD> mp_impl;
+
+  public:
+    // Construction / destruction
+    FD();
+    FD(fd_t handle);
+    ~FD();
+
+    // Movable, not copyable
+    FD(FD&& o);
+    FD& operator=(FD&& o);
+    FD(const FD& o) = delete;
+    FD& operator=(const FD& o) = delete;
+
+    // Reads `size` bytes at `offset` into `dest`; zsize_t(-1) on failure.
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    // File size, zsize_t(0) on failure.
+    zsize_t getSize() const;
+    // Hands the handle over as a C runtime file descriptor (-1 if none).
+    int release();
+    bool seek(offset_t offset);
+    bool close();
+};
+
+// Collection of static filesystem helpers for Windows. Paths are UTF-8
+// encoded std::string and are converted with toWideChar() before being
+// handed to the wide-character Win32 API.
+struct FS {
+  typedef zim::windows::FD FD;
+
+  // Path manipulation
+  static std::string join(path_t base, path_t name);
+  static std::unique_ptr<wchar_t[]> toWideChar(path_t path);
+
+  // Opening files (throws std::runtime_error on failure)
+  static FD openFile(path_t filepath);
+
+  // Creating / moving
+  static bool makeDirectory(path_t path);
+  static void rename(path_t old_path, path_t new_path);
+
+  // Removing
+  // NOTE(review): remove() has no definition in fs_windows.cpp - confirm it
+  // is defined elsewhere or unused on Windows.
+  static bool remove(path_t path);
+  static bool removeDir(path_t path);
+  static bool removeFile(path_t path);
+};
+
+}; // windows namespace
+
+}; // zim namespace
+
+#endif //ZIM_FS_WINDOWS_H_
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "istreamreader.h"
+#include "buffer_reader.h"
+
+namespace zim
+{
+
+////////////////////////////////////////////////////////////////////////////////
+// IDataStream
+////////////////////////////////////////////////////////////////////////////////
+
+// Default implementation: consume the next `size` bytes of the stream into a
+// freshly made Buffer and expose them through a BufferReader.
+std::unique_ptr<const Reader>
+IStreamReader::sub_reader(zsize_t size)
+{
+  auto storage = Buffer::makeBuffer(size);
+  // The buffer only exposes const data; it was just created here, so
+  // it is filled in place.
+  char* const writable = const_cast<char*>(storage.data());
+  readImpl(writable, size);
+  return std::unique_ptr<Reader>(new BufferReader(storage));
+}
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_IDATASTREAM_H
+#define ZIM_IDATASTREAM_H
+
+#include <exception>
+#include <memory>
+
+#include "endian_tools.h"
+#include "reader.h"
+
+namespace zim
+{
+
+// IStreamReader is a simple interface for sequential iteration over a stream
+// of values of built-in/primitive types and/or opaque binary objects (blobs).
+// An example usage:
+//
+// void foo(IStreamReader& s)
+// {
+// const uint32_t n = s.read<uint32_t>();
+// for(uint32_t i=0; i < n; ++i)
+// {
+// const uint16_t blobSize = s.read<uint16_t>();
+// std::unique_ptr<const Reader> blob = s.sub_reader(zsize_t(blobSize));
+// bar(*blob, blobSize);
+// }
+// }
+//
+class IStreamReader
+{
+public: // functions
+ virtual ~IStreamReader() = default;
+
+ // Reads a value of type T from the stream (sizeof(T) bytes are consumed
+ // and decoded as little-endian).
+ // For best portability this function should be used with types of known
+ // bit-width (int32_t, uint16_t, etc) rather than builtin types with
+ // unknown bit-width (int, unsigned, etc).
+ template<typename T> T read();
+
+ // Reads the next 'size' bytes from the stream and returns them behind a Reader
+ virtual std::unique_ptr<const Reader> sub_reader(zsize_t size);
+
+private: // virtual methods
+ // Reads exactly 'nbytes' bytes into the provided buffer 'buf'
+ // (which must be at least that big). Throws an exception if
+ // more bytes are requested than can be retrieved.
+ virtual void readImpl(char* buf, zsize_t nbytes) = 0;
+};
+
+////////////////////////////////////////////////////////////////////////////////
+// Implementation of IStreamReader
+////////////////////////////////////////////////////////////////////////////////
+
+// XXX: Assuming that opaque binary data retrieved via 'readImpl()'
+// XXX: is encoded in little-endian form.
+template<typename T>
+inline T
+IStreamReader::read()
+{
+  // Fetch exactly sizeof(T) raw bytes from the stream ...
+  constexpr size_type valueSize(sizeof(T));
+  char rawBytes[valueSize];
+  readImpl(rawBytes, zsize_t(valueSize));
+
+  // ... and decode them as a little-endian value.
+  // XXX: This handles only integral types
+  return fromLittleEndian<T>(rawBytes);
+}
+
+} // namespace zim
+
+#endif // ZIM_IDATASTREAM_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Veloman Yunkan
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/item.h>
+#include "_dirent.h"
+#include "cluster.h"
+#include "fileimpl.h"
+#include "file_part.h"
+#include "log.h"
+
+log_define("zim.item")
+
+using namespace zim;
+
+// Builds an item for entry `idx` of `file`. The corresponding dirent is
+// looked up in the file here and stored, so the accessors below work from
+// the stored dirent.
+Item::Item(std::shared_ptr<FileImpl> file, entry_index_type idx)
+ : m_file(file),
+ m_idx(idx),
+ m_dirent(file->getDirent(entry_index_t(idx)))
+{}
+
+// Title of the item, as provided by its dirent.
+std::string Item::getTitle() const
+{
+  std::string title = m_dirent->getTitle();
+  return title;
+}
+
+// Path of the item: the long url for archives that do not use the new
+// namespace scheme, the plain url otherwise.
+std::string Item::getPath() const
+{
+  if (!m_file->hasNewNamespaceScheme()) {
+    return m_dirent->getLongUrl();
+  }
+  return m_dirent->getUrl();
+}
+
+// Mime type of the item: the value stored in the dirent is translated to a
+// string by the archive.
+std::string Item::getMimetype() const
+{
+  const auto direntMimeType = m_dirent->getMimeType();
+  return m_file->getMimeType(direntMimeType);
+}
+
+// Returns the item content starting at `offset` and running to the end of
+// the item (the requested size is getSize() minus `offset`).
+Blob Item::getData(offset_type offset) const
+{
+  return getData(offset, getSize()-offset);
+}
+
+// Returns `size` bytes of the item content starting at `offset`, read from
+// the blob of the cluster that the dirent points to.
+Blob Item::getData(offset_type offset, size_type size) const
+{
+  const auto owningCluster = m_file->getCluster(m_dirent->getClusterNumber());
+  const auto blobNumber = m_dirent->getBlobNumber();
+  return owningCluster->getBlob(blobNumber, offset_t(offset), zsize_t(size));
+}
+
+// Size of the item content, i.e. the size of its blob inside its cluster.
+size_type Item::getSize() const
+{
+  const auto owningCluster = m_file->getCluster(m_dirent->getClusterNumber());
+  const auto blobSize = owningCluster->getBlobSize(m_dirent->getBlobNumber());
+  return size_type(blobSize);
+}
+
+// Returns {filename, offset} locating the item content inside one of the
+// archive's file parts, or {"", 0} when such direct access is not possible
+// (compressed cluster, or content not contained in a single part).
+std::pair<std::string, offset_type> Item::getDirectAccessInformation() const
+{
+  auto owningCluster = m_file->getCluster(m_dirent->getClusterNumber());
+  if (owningCluster->isCompressed()) {
+    return std::make_pair("", 0);
+  }
+
+  // Offset of the blob, shifted by the archive start offset.
+  auto archive_offset = m_file->getBlobOffset(m_dirent->getClusterNumber(),
+                                              m_dirent->getBlobNumber());
+  archive_offset += m_file->getArchiveStartOffset().v;
+
+  const auto parts = m_file->getFileParts(archive_offset, zsize_t(getSize()));
+  const auto containing_part = parts.first;
+  auto following_part = containing_part;
+  ++following_part;
+  if (following_part != parts.second) {
+    // The content is split on two parts. We cannot have direct access
+    return std::make_pair("", 0);
+  }
+
+  auto part_range = containing_part->first;
+  auto file_part = containing_part->second;
+  const offset_type local_offset(archive_offset - part_range.min);
+  return std::make_pair(file_part->filename(), local_offset);
+}
+
+// Raw index of the cluster the dirent refers to.
+cluster_index_type Item::getClusterIndex() const
+{
+  const auto clusterNumber = m_dirent->getClusterNumber();
+  return clusterNumber.v;
+}
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "config.h"
+
+#ifdef WITH_CXXTOOLS
+
+#include <cxxtools/log.h>
+
+#else
+
+/* Built without WITH_CXXTOOLS: every logging macro expands to nothing, so
+   logging statements in the sources compile away. */
+#define log_define(e)
+#define log_fatal(e)
+#define log_error(e)
+#define log_warn(e)
+#define log_info(e)
+#define log_debug(e)
+#define log_trace(e)
+#define log_init()
+
+#endif
--- /dev/null
+/*
+ * Copyright (c) 2021, Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (c) 2020, Veloman Yunkan
+ * Copyright (c) 2014, lamerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of lamerman nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * File: lrucache.hpp
+ * Author: Alexander Ponomarev
+ *
+ * Created on June 20, 2013, 5:09 PM
+ */
+
+#ifndef _LRUCACHE_HPP_INCLUDED_
+#define _LRUCACHE_HPP_INCLUDED_
+
+#include <map>
+#include <list>
+#include <cstddef>
+#include <stdexcept>
+#include <cassert>
+
+namespace zim {
+
+// Least-recently-used cache mapping key_t to value_t, holding at most
+// `max_size` items.
+//
+// Items live in a list ordered from most recently used (front) to least
+// recently used (back); a map from key to list iterator provides the
+// lookup. Every successful access moves the item to the front, and an
+// insertion that makes the cache exceed `max_size` evicts the item at the
+// back. With `max_size == 0` an inserted item is therefore evicted right
+// away.
+template<typename key_t, typename value_t>
+class lru_cache {
+public: // types
+  typedef std::pair<key_t, value_t> key_value_pair_t;
+  typedef typename std::list<key_value_pair_t>::iterator list_iterator_t;
+
+  enum AccessStatus {
+    HIT, // key was found in the cache
+    PUT, // key was not in the cache but was created by the getOrPut() access
+    MISS // key was not in the cache; get() access failed
+  };
+
+  // Outcome of a cache access: a status plus a copy of the value (the value
+  // is default-constructed and not retrievable when the status is MISS).
+  class AccessResult
+  {
+    const AccessStatus status_;
+    const value_t val_;
+  public:
+    AccessResult(const value_t& val, AccessStatus status)
+      : status_(status), val_(val)
+    {}
+    AccessResult() : status_(MISS), val_() {}
+
+    // True only if the key was already present in the cache.
+    bool hit() const { return status_ == HIT; }
+    // True for both MISS and PUT.
+    bool miss() const { return !hit(); }
+    // Returns the value; throws std::range_error if the status is MISS.
+    const value_t& value() const
+    {
+      if ( status_ == MISS )
+        throw std::range_error("There is no such key in cache");
+      return val_;
+    }
+
+    operator const value_t& () const { return value(); }
+  };
+
+public: // functions
+  explicit lru_cache(size_t max_size) :
+    _max_size(max_size) {
+  }
+
+  // If 'key' is present in the cache, marks it as most recently used and
+  // returns the associated value with status HIT. Otherwise puts the given
+  // value into the cache and returns it with status PUT (which miss()
+  // reports as a miss).
+  AccessResult getOrPut(const key_t& key, const value_t& value) {
+    auto it = _cache_items_map.find(key);
+    if (it != _cache_items_map.end()) {
+      _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second);
+      return AccessResult(it->second->second, HIT);
+    } else {
+      putMissing(key, value);
+      return AccessResult(value, PUT);
+    }
+  }
+
+  // Inserts 'key' with 'value', or overwrites the value of an existing key.
+  // Either way the item becomes the most recently used one.
+  void put(const key_t& key, const value_t& value) {
+    auto it = _cache_items_map.find(key);
+    if (it != _cache_items_map.end()) {
+      _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second);
+      it->second->second = value;
+    } else {
+      putMissing(key, value);
+    }
+  }
+
+  // Looks 'key' up. On success the item becomes the most recently used one
+  // and its value is returned with status HIT; otherwise the result has
+  // status MISS.
+  AccessResult get(const key_t& key) {
+    auto it = _cache_items_map.find(key);
+    if (it == _cache_items_map.end()) {
+      return AccessResult();
+    } else {
+      _cache_items_list.splice(_cache_items_list.begin(), _cache_items_list, it->second);
+      return AccessResult(it->second->second, HIT);
+    }
+  }
+
+  // Removes 'key' from the cache. Returns true if the key was present and
+  // has been removed, false if it was not in the cache.
+  bool drop(const key_t& key) {
+    // A single find() replaces the former at()/catch pair: an absent key is
+    // an ordinary outcome here, not an exceptional one, and the found
+    // iterator lets us erase without a second lookup.
+    const auto it = _cache_items_map.find(key);
+    if (it == _cache_items_map.end()) {
+      return false;
+    }
+    _cache_items_list.erase(it->second);
+    _cache_items_map.erase(it);
+    return true;
+  }
+
+  // Tells whether 'key' is cached, without changing the usage order.
+  bool exists(const key_t& key) const {
+    return _cache_items_map.find(key) != _cache_items_map.end();
+  }
+
+  // Number of items currently cached.
+  size_t size() const {
+    return _cache_items_map.size();
+  }
+
+private: // functions
+  // Inserts a key known to be absent as the most recently used item, then
+  // evicts the least recently used item if the capacity is exceeded.
+  void putMissing(const key_t& key, const value_t& value) {
+    assert(_cache_items_map.find(key) == _cache_items_map.end());
+    _cache_items_list.push_front(key_value_pair_t(key, value));
+    _cache_items_map[key] = _cache_items_list.begin();
+    if (_cache_items_map.size() > _max_size) {
+      _cache_items_map.erase(_cache_items_list.back().first);
+      _cache_items_list.pop_back();
+    }
+  }
+
+private: // data
+  std::list<key_value_pair_t> _cache_items_list;  // most recently used first
+  std::map<key_t, list_iterator_t> _cache_items_map;
+  size_t _max_size;
+};
+
+} // namespace zim
+
+#endif /* _LRUCACHE_HPP_INCLUDED_ */
--- /dev/null
+/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ */
+
+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+rights reserved.
+
+License to copy and use this software is granted provided that it
+is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+Algorithm" in all material mentioning or referencing this software
+or this function.
+
+License is also granted to make and use derivative works provided
+that such works are identified as "derived from the RSA Data
+Security, Inc. MD5 Message-Digest Algorithm" in all material
+mentioning or referencing the derived work.
+
+RSA Data Security, Inc. makes no representations concerning either
+the merchantability of this software or the suitability of this
+software for any particular purpose. It is provided "as is"
+without express or implied warranty of any kind.
+
+These notices must be retained in any copies of any part of this
+documentation and/or software.
+ */
+
+#include "md5.h"
+#include <string.h>
+
+#define MD5_CTX struct zim_MD5_CTX
+
+/* Constants for MD5Transform routine.
+ */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
+static void Encode PROTO_LIST
+ ((unsigned char *, UINT4 *, unsigned int));
+static void Decode PROTO_LIST
+ ((UINT4 *, const unsigned char *, unsigned int));
+/*
+static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int));
+static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int));
+*/
+#define MD5_memcpy memcpy
+#define MD5_memset memset
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions.
+ */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits.
+ */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context:
+   the bit count is reset and the state words are loaded with the magic
+   initialization constants.
+ */
+void zim_MD5Init (MD5_CTX* context)
+{
+  /* Magic initialization constants, in state[0..3] order. */
+  static const UINT4 initial_state[4] = {
+    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+  };
+  unsigned int word;
+
+  context->count[1] = 0;
+  context->count[0] = 0;
+
+  for (word = 0; word < 4; word++)
+    context->state[word] = initial_state[word];
+}
+
+/* MD5 block update operation. Continues an MD5 message-digest
+ operation, processing another message block, and updating the
+ context.
+
+ context  - digest state being updated
+ input    - bytes to add to the message
+ inputLen - number of bytes in input
+
+ Complete 64-byte blocks are transformed right away; any remainder is kept
+ in context->buffer until later calls complete the block.
+ */
+void zim_MD5Update (
+MD5_CTX *context,
+const unsigned char *input, /* input block */
+unsigned int inputLen) /* length of input block */
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64: count[0] is a bit count, so >> 3 turns
+    it into bytes; this is the fill level of context->buffer */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits: add inputLen * 8 to the low word; a result
+    smaller than the addend means the low word wrapped, so carry into the
+    high word. The top 3 bits lost by the << 3 are added to the high word
+    separately. */
+ if ((context->count[0] += ((UINT4)inputLen << 3))
+ < ((UINT4)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((UINT4)inputLen >> 29);
+
+ /* Number of bytes needed to complete the buffered block */
+ partLen = 64 - index;
+
+ /* Transform as many times as possible.
+*/
+ if (inputLen >= partLen) {
+ /* Complete the buffered block and transform it ... */
+ MD5_memcpy
+ ((POINTER)&context->buffer[index], (POINTER)input, partLen);
+ MD5Transform (context->state, context->buffer);
+
+ /* ... then transform every further full block straight from the input */
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ MD5_memcpy
+ ((POINTER)&context->buffer[index], (POINTER)&input[i],
+ inputLen-i);
+}
+
+/* MD5 finalization. Ends an MD5 message-digest operation, writing the
+ message digest and zeroizing the context.
+
+ digest  - receives the 16-byte message digest
+ context - digest state; it is zero-filled on return, so zim_MD5Init must
+           be called again before the context is reused
+ */
+void zim_MD5Final (
+unsigned char digest[16], /* message digest */
+MD5_CTX *context) /* context */
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits (must happen before padding, because the padding
+    goes through zim_MD5Update and changes the count) */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64, leaving room for the 8 length bytes. When the
+    buffer already holds 56 or more bytes the padding runs into the next
+    block.
+*/
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ zim_MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ zim_MD5Update (context, bits, 8);
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information.
+*/
+ MD5_memset ((POINTER)context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block.
+
+ state - the four state words; updated in place
+ block - the 64 input bytes to mix into the state
+
+ The block is decoded into sixteen words x[0..15], which are combined with
+ working copies a, b, c, d of the state in four rounds of sixteen steps
+ (one round per FF/GG/HH/II macro). The working copies are finally added
+ to the state.
+ */
+static void MD5Transform (
+UINT4 state[4],
+const unsigned char block[64])
+{
+ UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ /* Fold the working copies back into the running state */
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information.
+    NOTE(review): x is a local that is not read after this call, so a
+    compiler may be allowed to drop the memset - confirm if the wipe
+    matters.
+*/
+ MD5_memset ((POINTER)x, 0, sizeof (x));
+}
+
+/* Encodes input (UINT4) into output (unsigned char), least significant
+   byte of each word first. Assumes len (the number of output bytes) is a
+   multiple of 4.
+ */
+static void Encode (
+unsigned char *output,
+UINT4 *input,
+unsigned int len)
+{
+  unsigned int word, pos, k;
+
+  for (word = 0, pos = 0; pos < len; word++, pos += 4) {
+    /* Byte k of the output group is bits 8k..8k+7 of the input word. */
+    for (k = 0; k < 4; k++)
+      output[pos+k] = (unsigned char)((input[word] >> (8*k)) & 0xff);
+  }
+}
+
+/* Decodes input (unsigned char) into output (UINT4), treating the first
+   byte of each group of four as the least significant one. Assumes len
+   (the number of input bytes) is a multiple of 4.
+ */
+static void Decode (
+UINT4 *output,
+const unsigned char *input,
+unsigned int len)
+{
+  unsigned int word, pos, k;
+
+  for (word = 0, pos = 0; pos < len; word++, pos += 4) {
+    UINT4 value = 0;
+
+    /* Input byte k supplies bits 8k..8k+7 of the output word. */
+    for (k = 0; k < 4; k++)
+      value |= ((UINT4)input[pos+k]) << (8*k);
+    output[word] = value;
+  }
+}
+
+#if 0
+/* Note: Replace "for loop" with standard memcpy if possible.
+ */
+
+static void MD5_memcpy (
+POINTER output,
+POINTER input,
+unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ output[i] = input[i];
+}
+
+/* Note: Replace "for loop" with standard memset if possible.
+ */
+static void MD5_memset (
+POINTER output,
+int value,
+unsigned int len)
+{
+ unsigned int i;
+
+ for (i = 0; i < len; i++)
+ ((char *)output)[i] = (char)value;
+}
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2003 Tommi Maekitalo
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * As a special exception, you may use this file as part of a free
+ * software library without restriction. Specifically, if other files
+ * instantiate templates or use macros or inline functions from this
+ * file, or you compile this file and link it with other files to
+ * produce an executable, this file does not by itself cause the
+ * resulting executable to be covered by the GNU General Public
+ * License. This exception does not however invalidate any other
+ * reasons why the executable file might be covered by the GNU Library
+ * General Public License.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+rights reserved.
+
+License to copy and use this software is granted provided that it
+is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+Algorithm" in all material mentioning or referencing this software
+or this function.
+
+License is also granted to make and use derivative works provided
+that such works are identified as "derived from the RSA Data
+Security, Inc. MD5 Message-Digest Algorithm" in all material
+mentioning or referencing the derived work.
+
+RSA Data Security, Inc. makes no representations concerning either
+the merchantability of this software or the suitability of this
+software for any particular purpose. It is provided "as is"
+without express or implied warranty of any kind.
+
+These notices must be retained in any copies of any part of this
+documentation and/or software.
+ */
+
+/* RSAREF types and constants
+ */
+
+/* PROTOTYPES should be set to one if and only if the compiler supports
+ function argument prototyping.
+The following makes PROTOTYPES default to 1 if it has not already
+ been defined with C compiler flags.
+ */
+
+#ifndef ZIM_MD5_H
+#define ZIM_MD5_H
+
+#ifndef PROTOTYPES
+#define PROTOTYPES 1
+#endif
+
+/* POINTER defines a generic pointer type */
+typedef unsigned char *POINTER;
+
+/* UINT2 defines a two byte word */
+typedef unsigned short int UINT2;
+
+/* UINT4 defines a four byte word */
+typedef unsigned int UINT4;
+
+/* PROTO_LIST is defined depending on how PROTOTYPES is defined above.
+ If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it
+ returns an empty list.
+ */
+
+#if PROTOTYPES
+#define PROTO_LIST(list) list
+#else
+#define PROTO_LIST(list) ()
+#endif
+
+/* MD5 context: the state of one digest computation. It is set up by
+   zim_MD5Init, advanced by zim_MD5Update and consumed by zim_MD5Final. */
+struct zim_MD5_CTX {
+ UINT4 state[4]; /* state (ABCD) */
+ UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */
+ unsigned char buffer[64]; /* input buffer */
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *));
+void zim_MD5Update PROTO_LIST
+ ((struct zim_MD5_CTX *, const unsigned char *, unsigned int));
+void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZIM_MD5_H */
--- /dev/null
+
+# Generate config.h from config.h.in using the values in private_conf.
+configure_file(output : 'config.h',
+ configuration : private_conf,
+ input : 'config.h.in')
+
+src_directory = include_directories('.')
+
+# Sources that are always part of the library.
+common_sources = [
+# 'config.h',
+ 'archive.cpp',
+ 'cluster.cpp',
+ 'buffer_reader.cpp',
+ 'dirent.cpp',
+ 'dirent_accessor.cpp',
+ 'entry.cpp',
+ 'envvalue.cpp',
+ 'fileheader.cpp',
+ 'fileimpl.cpp',
+ 'file_compound.cpp',
+ 'file_reader.cpp',
+ 'item.cpp',
+ 'blob.cpp',
+ 'buffer.cpp',
+ 'md5.c',
+ 'template.cpp',
+ 'uuid.cpp',
+ 'tools.cpp',
+ 'compression.cpp',
+ 'istreamreader.cpp',
+ 'writer/contentProvider.cpp',
+ 'writer/creator.cpp',
+ 'writer/item.cpp',
+ 'writer/cluster.cpp',
+ 'writer/dirent.cpp',
+ 'writer/workers.cpp',
+ 'writer/clusterWorker.cpp',
+ 'writer/titleListingHandler.cpp',
+ 'writer/counterHandler.cpp',
+ 'suggestion.cpp',
+ 'suggestion_iterator.cpp',
+ 'version.cpp'
+]
+
+# Pick the filesystem backend matching the platform the library will run on.
+if host_machine.system() == 'windows'
+ common_sources += 'fs_windows.cpp'
+else
+ common_sources += 'fs_unix.cpp'
+endif
+
+# Sources that are only built when xapian is available (see below).
+xapian_sources = [
+ 'search.cpp',
+ 'search_iterator.cpp',
+ 'xapian/htmlparse.cc',
+ 'xapian/myhtmlparse.cc',
+ 'writer/xapianIndexer.cpp',
+ 'writer/xapianWorker.cpp',
+ 'writer/xapianHandler.cpp'
+]
+
+sources = common_sources
+deps = [thread_dep, lzma_dep, zstd_dep]
+
+# NOTE(review): this tests target_machine while the filesystem backend above
+# is selected with host_machine - confirm which one is intended, keeping it
+# consistent with wherever execinfo_dep is defined.
+if target_machine.system() == 'freebsd'
+ deps += [execinfo_dep]
+endif
+
+# With xapian, also build the search/indexing sources and the embedded
+# resources, and link against xapian and icu.
+if xapian_dep.found()
+ sources += xapian_sources
+ sources += lib_resources
+ deps += [xapian_dep, icu_dep]
+endif
+
+# The library itself, plus a dependency object through which other parts of
+# the build can link against it.
+libzim = library('zim',
+ sources,
+ include_directories : inc,
+ dependencies : deps,
+ link_args : extra_link_args,
+ cpp_args : extra_cpp_args,
+ version: meson.project_version(),
+ install : true)
+libzim_dep = declare_dependency(link_with: libzim,
+ include_directories: include_directory)
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_NARROWDOWN_H
+#define ZIM_NARROWDOWN_H
+
+#include "zim_types.h"
+#include "debug.h"
+
+#include <algorithm>
+#include <vector>
+
+#include <zim/error.h>
+
+namespace zim
+{
+
+// Given a sorted sequence of items with a string key, NarrowDown helps to
+// narrow down the range in which the query key should belong.
+//
+// The target usage of this class is as a partial in-memory index for a sorted
+// list residing in external storage with high access cost to individual items.
+//
+// Illustration:
+//
+// In RAM:
+// key: A I Q Y g o w z
+// item #: | | | | | | | |
+// ----------- | | | | | | | |
+// On disk: V V V V V V V V
+// key: ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
+// data: ajo097124ljp-oasd)(&(*)llkjasdf@$^nFDSs00ujlasdfjkll
+//
+// In such an external list looking up an item by key can be performed via a
+// binary search where on each iteration the item key must be accessed. There
+// are two performance problems with that:
+// 1. The API may not allow accessing only the key of the given item, reading
+// the entire item instead (this is the case with dirents).
+// 2. Access to items (or only their keys) in external storage is expensive.
+//
+// NarrowDown speeds up the look-up operation in such an external list by
+// allowing to split it into two steps:
+// 1. Perform the binary search on the index, yielding a narrower range
+// 2. Perform the binary search on the external list starting from that
+// narrower range.
+//
+// The denser the in-memory index the more the performance improvement.
+// Therefore the implementation focus of NarrowDown is on small memory
+// footprint. If the item keys are long strings with a lot of "garbage" at the
+// end the following trick helps. Suppose that we have the following pair of
+// adjacent keys in our full (external) list:
+//
+// Item # | Key
+// ---------------------------------
+// ... | ...
+// 1234 | "We Are The Champions"
+// 1235 | "We Will Rock You"
+// ... | ...
+//
+// If we were to include the item #1234 in our index the naive approach would
+// be to store its key as is. However, let's imagine that the list also
+// contains an item with key "We W". Then it would have to reside between "We
+// Are The Champions" and "We Will Rock You". So we can pretend that such an
+// item exists and store in our index the fictitious entry {"We W", 1234.5}.
+// When we arrive at that entry during the range narrow-down step we must round
+// the item index downward if it is going to be used as the lower bound of
+// the range, and round it upward if it is going to be used as the upper bound
+// of the range.
+class NarrowDown
+{
+ typedef entry_index_type index_type;
+
+public: // types
+ // Pair of item indices returned by getRange().
+ // NOTE(review): looks like a half-open interval [begin, end) - getRange()
+ // returns {0, 0} in its "no candidate" case - confirm against the callers.
+ struct Range
+ {
+ const index_type begin, end;
+ };
+
+public: // functions
+ // Creates an empty index. The lookup predicate is bound to this object's
+ // keyContentArea, which holds the text of the stored pseudo-keys.
+ NarrowDown()
+ : pred(&keyContentArea)
+ {}
+
+  // Add another entry to the search index. The key of the next item is used
+  // to derive and store a shorter pseudo-key as explained in the long comment
+  // above the class. The very first entry is stored with its full key.
+  //
+  // Throws ZimFileFormatError if `key` sorts after `nextKey`, or if the
+  // derived pseudo-key sorts before the previously stored one.
+  void add(const std::string& key, index_type i, const std::string& nextKey)
+  {
+    // It would be better to require `key < nextKey`, but pretty old zim files
+    // were not enforced to have unique urls: entries were sorted by url, yet
+    // two entries could have the same url. This was a bug and has been fixed
+    // since, but we still have to be tolerant here and accept that two
+    // consecutive keys can be equal.
+    if (key > nextKey) {
+      // Renders a key as "<first char>/<rest>". An empty key (possible for
+      // nextKey here) is rendered as an empty string: calling substr(1) on
+      // it would throw std::out_of_range and mask the real error.
+      const auto formatKey = [](const std::string& k) {
+        return k.empty() ? std::string() : k.substr(0, 1) + "/" + k.substr(1);
+      };
+      std::stringstream ss;
+      ss << "Dirent table is not properly sorted:\n";
+      ss << " #" << i << ": " << formatKey(key) << "\n";
+      ss << " #" << i+1 << ": " << formatKey(nextKey);
+      throw ZimFileFormatError(ss.str());
+    }
+    if ( entries.empty() ) {
+      addEntry(key, i);
+    }
+    else
+    {
+      const std::string pseudoKey = shortestStringInBetween(key, nextKey);
+      if (pred(pseudoKey, entries.back())) {
+        std::stringstream ss;
+        ss << "Dirent table is not properly sorted:\n";
+        ss << "PseudoKey " << pseudoKey << " should be after (or equal) previously generated " << pred.getKeyContent(entries.back()) << "\n";
+        throw ZimFileFormatError(ss.str());
+      }
+      ASSERT(entries.back().lindex, <, i);
+      addEntry(pseudoKey, i);
+    }
+  }
+
+ // Appends the closing entry of the index. Unlike add(), the key is stored
+ // as is (no pseudo-key is derived). The assertions check the new entry
+ // against the last stored one: pred(entries.back(), key) must hold and
+ // `i` must be greater than the last stored index.
+ void close(const std::string& key, index_type i)
+ {
+ ASSERT(entries.empty() || pred(entries.back(), key), ==, true);
+ ASSERT(entries.empty() || entries.back().lindex < i, ==, true);
+ addEntry(key, i);
+ }
+
+  // Narrows down the range of item indices in which `key` may be found.
+  //
+  // The index is searched with std::upper_bound (using pred). If the result
+  // is the very first entry, {0, 0} is returned. Otherwise the range starts
+  // at the index stored in the preceding entry and ends one past the index
+  // stored in the found entry (or one past the start if there is no such
+  // entry).
+  Range getRange(const std::string& key) const
+  {
+    const auto upper = std::upper_bound(entries.begin(), entries.end(), key, pred);
+    if ( upper == entries.begin() ) {
+      return {0, 0};
+    }
+
+    const auto lower = upper - 1;
+    const index_type rangeBegin = lower->lindex;
+    const index_type lastIncluded = (upper == entries.end()) ? rangeBegin
+                                                             : upper->lindex;
+    return {rangeBegin, lastIncluded+1};
+  }
+
+ // Returns the shortest prefix of `b` that is strictly greater than `a`,
+ // i.e. the common prefix of `a` and `b` plus the next character of `b`.
+ // When `b` has no character after the common prefix (in particular when
+ // `a == b`), the whole of `b` is returned.
+ // Precondition (asserted): a <= b.
+ static std::string shortestStringInBetween(const std::string& a, const std::string& b)
+ {
+   ASSERT(a, <=, b);
+
+   // msvc version of `std::mismatch(begin1, end1, begin2)`
+   // need `begin2 + (end1-begin1)` to be valid.
+   // So we cannot simply pass `a.end()` as `end1`.
+   const auto minlen = std::min(a.size(), b.size());
+   const auto m = std::mismatch(a.begin(), a.begin()+minlen, b.begin());
+
+   // Only step past the first differing character when there is one:
+   // computing `b.end()+1` is undefined behaviour (and is rejected by
+   // checked iterators), even if the result is clamped afterwards.
+   const auto stop = ( m.second == b.end() ) ? b.end() : m.second+1;
+   return std::string(b.begin(), stop);
+ }
+
+private: // functions
+ // Stores a new entry for index `i`: the entry records the current end of
+ // keyContentArea as the offset of its key, then the characters of `s`
+ // followed by a terminating '\0' are appended to keyContentArea.
+ void addEntry(const std::string& s, index_type i)
+ {
+   const Entry newEntry = {uint32_t(keyContentArea.size()), i};
+   entries.push_back(newEntry);
+   keyContentArea.insert(keyContentArea.end(), s.begin(), s.end());
+   keyContentArea.push_back('\0');
+ }
+
+private: // types
+ typedef std::vector<char> KeyContentArea;
+
+ // One element of the search index: a (pseudo-)key, stored as an offset
+ // into keyContentArea, paired with an index into the input sequence.
+ struct Entry
+ {
+ // This is mostly a truncated version of a key from the input sequence.
+ // The exceptions are
+ // - the first item
+ // - the last item
+ // - keys that differ from their preceding key only in the last character
+ //
+ // std::string pseudoKey; // std::string has too much memory overhead.
+ uint32_t pseudoKeyOffset; // Instead we densely pack the key contents
+ // into keyContentArea and store in the entry
+ // the offset into that container.
+
+ // This represents the index of the item in the input sequence right
+ // after which pseudoKey might be inserted without breaking the sequence
+ // order. In other words, the condition
+ //
+ // sequence[lindex] <= pseudoKey <= sequence[lindex+1]
+ //
+ // must be true.
+ index_type lindex;
+ };
+
+ // Comparison functor between entries and plain string keys. An entry's key
+ // is read as a C string located at its pseudoKeyOffset inside the
+ // referenced key content area.
+ struct LookupPred
+ {
+   // Storage holding the key characters; not owned by the predicate.
+   const KeyContentArea& keyContentArea;
+
+   explicit LookupPred(const KeyContentArea* kca)
+     : keyContentArea(*kca)
+   {}
+
+   // Pointer to the first character of the key stored for `entry`.
+   const char* getKeyContent(const Entry& entry) const
+   {
+     const char& firstChar = keyContentArea[entry.pseudoKeyOffset];
+     return &firstChar;
+   }
+
+   // True when the key of `entry` is less than or equal to `key`.
+   bool operator()(const Entry& entry, const std::string& key) const
+   {
+     const int cmp = key.compare(getKeyContent(entry));
+     return cmp >= 0;
+   }
+
+   // True when `key` is strictly less than the key of `entry`.
+   bool operator()(const std::string& key, const Entry& entry) const
+   {
+     const int cmp = key.compare(getKeyContent(entry));
+     return cmp < 0;
+   }
+ };
+
+ typedef std::vector<Entry> EntryCollection;
+
+private: // data
+ // Used to store the (shortened) keys as densely packed C-style strings
+ KeyContentArea keyContentArea;
+
+ LookupPred pred;
+
+ EntryCollection entries;
+};
+
+} // namespace zim
+
+#endif // ZIM_NARROWDOWN_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_RAWSTREAMREADER_H
+#define ZIM_RAWSTREAMREADER_H
+
+#include "istreamreader.h"
+#include "reader.h"
+#include "debug.h"
+
+namespace zim
+{
+
+// IStreamReader implementation that serves data sequentially out of a
+// Reader: it starts at offset 0 and moves its position forward by the
+// amount handed out on each call.
+class RawStreamReader : public IStreamReader
+{
+public: // functions
+  explicit RawStreamReader(std::shared_ptr<const zim::Reader> reader)
+    : m_reader(reader),
+      m_readerPos(0)
+  {}
+
+  // Copies the next `nbytes` bytes of the underlying reader into `buf`,
+  // then advances the current position by `nbytes`.
+  void readImpl(char* buf, zsize_t nbytes) override
+  {
+    const zsize_t count(nbytes);
+    m_reader->read(buf, m_readerPos, count);
+    m_readerPos += nbytes;
+  }
+
+  // Returns a reader over the next `nbytes` bytes of the underlying reader,
+  // then advances the current position by `nbytes`.
+  std::unique_ptr<const Reader> sub_reader(zsize_t nbytes) override
+  {
+    auto window = m_reader->sub_reader(m_readerPos, nbytes);
+    m_readerPos += nbytes;
+    return window;
+  }
+
+private: // data
+  // Source of the data.
+  std::shared_ptr<const Reader> m_reader;
+  // Offset in m_reader of the next byte to hand out.
+  offset_t m_readerPos;
+};
+
+} // namespace zim
+
+#endif // ZIM_RAWSTREAMREADER_H
--- /dev/null
+/*
+ * Copyright (C) 2017-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_READER_H_
+#define ZIM_READER_H_
+
+#include <memory>
+
+#include "zim_types.h"
+#include "endian_tools.h"
+#include "debug.h"
+
+#include "buffer.h"
+
+namespace zim {
+
+// Abstract interface giving random access to a sequence of bytes.
+class Reader {
+  public:
+    Reader() {}
+    // Number of bytes that can be read through this reader.
+    virtual zsize_t size() const = 0;
+    virtual ~Reader() {}
+
+    // Copies `size` bytes starting at `offset` into `dest`.
+    virtual void read(char* dest, offset_t offset, zsize_t size) const = 0;
+
+    // Reads sizeof(T) bytes at `offset` and decodes them with
+    // fromLittleEndian<T>(). Asserts that the whole value lies inside the
+    // reader.
+    template<typename T>
+    T read_uint(offset_t offset) const {
+      ASSERT(offset.v, <, size().v);
+      ASSERT(offset.v+sizeof(T), <=, size().v);
+      char raw[sizeof(T)];
+      read(raw, offset, zsize_t(sizeof(T)));
+      return fromLittleEndian<T>(raw);
+    }
+
+    // Returns the single byte located at `offset`.
+    virtual char read(offset_t offset) const = 0;
+
+    // Returns a Buffer for the `size` bytes starting at `offset`.
+    virtual const Buffer get_buffer(offset_t offset, zsize_t size) const = 0;
+
+    // Same as above, covering everything from `offset` to the end.
+    const Buffer get_buffer(offset_t offset) const {
+      const zsize_t remaining(size().v-offset.v);
+      return get_buffer(offset, remaining);
+    }
+
+    // Returns a reader restricted to the `size` bytes starting at `offset`.
+    virtual std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const = 0;
+
+    // Same as above, covering everything from `offset` to the end.
+    std::unique_ptr<const Reader> sub_reader(offset_t offset) const {
+      const zsize_t remaining(size().v-offset.v);
+      return sub_reader(offset, remaining);
+    }
+
+    // NOTE(review): presumably the position of this reader inside its
+    // underlying source - confirm against the implementations.
+    virtual offset_t offset() const = 0;
+
+    // Defined out of line.
+    bool can_read(offset_t offset, zsize_t size) const;
+};
+
+};
+
+#endif // ZIM_READER_H_
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2021 Veloman Yunkan
+ * Copyright (C) 2020 Emmanuel Engelhart <kelson@kiwix.org>
+ * Copyright (C) 2018 Kunal Mehta <legoktm@member.fsf.org>
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/search.h>
+#include <zim/archive.h>
+#include <zim/item.h>
+#include "fileimpl.h"
+#include "search_internal.h"
+#include "fs.h"
+#include "tools.h"
+
+#include <sstream>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#if !defined(_WIN32)
+# include <unistd.h>
+#else
+# include <io.h>
+#endif
+#include <errno.h>
+
+#include "xapian.h"
+#include <unicode/locid.h>
+
+#include "constants.h"
+
+#define MAX_MATCHES_TO_SORT 10000
+
+namespace zim
+{
+
+// Builds the aggregated full-text search database for `archives`.
+// Archives for which no xapian index entry is found, or whose index cannot
+// be opened, are silently skipped. The first archive that contributes a
+// database also provides the valuesmap, the stemming language and the stop
+// words used by the query parser.
+InternalDataBase::InternalDataBase(const std::vector<Archive>& archives, bool verbose)
+ : m_verbose(verbose)
+{
+ bool first = true;
+ // NOTE(review): the parser is given m_database before any sub-database is
+ // added to it below - confirm the parser does not need the final database.
+ m_queryParser.set_database(m_database);
+ m_queryParser.set_default_op(Xapian::Query::op::OP_AND);
+
+ for(auto& archive: archives) {
+ auto impl = archive.getImpl();
+ FileImpl::FindxResult r;
+ // Try the 'X' namespace location first, then fall back to the 'Z' one.
+ r = impl->findx('X', "fulltext/xapian");
+ if (!r.first) {
+ r = impl->findx('Z', "/fulltextIndex/xapian");
+ }
+ if (!r.first) {
+ continue;
+ }
+ auto xapianEntry = Entry(impl, entry_index_type(r.second));
+ auto accessInfo = xapianEntry.getItem().getDirectAccessInformation();
+ // NOTE(review): a zero second member presumably means that the item
+ // cannot be accessed directly - confirm.
+ if (accessInfo.second == 0) {
+ continue;
+ }
+
+ Xapian::Database database;
+ if (!getDbFromAccessInfo(accessInfo, database)) {
+ continue;
+ }
+
+ if ( first ) {
+ // The first usable database configures the shared query parser.
+ m_valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+ auto language = database.get_metadata("language");
+ if (language.empty() ) {
+ // Databases created before 2017/03 have no language metadata.
+ // However, terms were stemmed anyway and we need to stem our
+ // search query the same way the database was created.
+ // So we need a language; let's use the one of the zim file.
+ // If the zim file has no language metadata either, there is not
+ // much more we can do here.
+ try {
+ language = archive.getMetadata("Language");
+ } catch(...) {}
+ }
+ if (!language.empty()) {
+ icu::Locale languageLocale(language.c_str());
+ /* Configuring language based stemming */
+ try {
+ m_stemmer = Xapian::Stem(languageLocale.getLanguage());
+ m_queryParser.set_stemmer(m_stemmer);
+ m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_ALL);
+ } catch (...) {
+ std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
+ }
+ }
+ // The "stopwords" metadata is read as one stop word per line.
+ auto stopwords = database.get_metadata("stopwords");
+ if ( !stopwords.empty() ){
+ std::string stopWord;
+ std::istringstream file(stopwords);
+ Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper();
+ while (std::getline(file, stopWord, '\n')) {
+ stopper->add(stopWord);
+ }
+ // NOTE(review): release() is expected to hand ownership of the stopper
+ // over to Xapian, which is why there is no delete - confirm against
+ // the Xapian documentation.
+ stopper->release();
+ m_queryParser.set_stopper(stopper);
+ }
+ } else {
+ std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+ if (m_valuesmap != valuesmap ) {
+ // [TODO] Ignore the database or raise an error? For now a mismatching
+ // valuesmap is accepted and the database is added anyway.
+ }
+ }
+ // Keep the database, its archive and the aggregated view in step.
+ m_xapianDatabases.push_back(database);
+ m_database.add_database(database);
+ m_archives.push_back(archive);
+ first = false;
+ }
+}
+
+// True when at least one xapian database was collected by the constructor.
+bool InternalDataBase::hasDatabase() const
+{
+ return !m_xapianDatabases.empty();
+}
+
+// True when the stored valuesmap contains at least one entry.
+bool InternalDataBase::hasValuesmap() const
+{
+ return !m_valuesmap.empty();
+}
+
+// True when `valueName` is a key of the valuesmap.
+bool InternalDataBase::hasValue(const std::string& valueName) const
+{
+  const bool known = m_valuesmap.count(valueName) != 0;
+  return known;
+}
+
+// Returns the slot number mapped to `valueName` in the valuesmap.
+// Throws std::out_of_range (from std::map::at) when the name is unknown;
+// use hasValue() to check first.
+int InternalDataBase::valueSlot(const std::string& valueName) const
+{
+ return m_valuesmap.at(valueName);
+}
+
+// Translates `query` into a Xapian query.
+// The text part is always run through the query parser. When the query
+// carries a geographic range and the database has a "geo.position" value,
+// a distance based query is built as well: it replaces the text query when
+// the text is empty, and filters it otherwise.
+Xapian::Query InternalDataBase::parseQuery(const Query& query)
+{
+  Xapian::Query textQuery = m_queryParser.parse_query(query.m_query);
+
+  const bool useGeo = query.m_geoquery && hasValue("geo.position");
+  if (!useGeo) {
+    return textQuery;
+  }
+
+  Xapian::GreatCircleMetric metric;
+  Xapian::LatLongCoord centre(query.m_latitude, query.m_longitude);
+  Xapian::LatLongDistancePostingSource ps(valueSlot("geo.position"), centre, metric, query.m_distance);
+  Xapian::Query geoQuery(&ps);
+
+  if (query.m_query.empty()) {
+    return geoQuery;
+  }
+  return Xapian::Query(Xapian::Query::OP_FILTER, textQuery, geoQuery);
+}
+
+// Creates a searcher over several archives. Each archive goes through
+// addArchive(), so archives already present are not added twice. The
+// internal database is not built here (mp_internalDb starts out null).
+Searcher::Searcher(const std::vector<Archive>& archives) :
+ mp_internalDb(nullptr),
+ m_verbose(false)
+{
+ for ( const auto& a : archives ) {
+ addArchive(a);
+ }
+}
+
+// Creates a searcher over a single archive.
+Searcher::Searcher(const Archive& archive) :
+ mp_internalDb(nullptr),
+ m_verbose(false)
+{
+ addArchive(archive);
+}
+
+// Copy/move operations and destructor use the compiler-generated versions.
+Searcher::Searcher(const Searcher& other) = default;
+Searcher& Searcher::operator=(const Searcher& other) = default;
+Searcher::Searcher(Searcher&& other) = default;
+Searcher& Searcher::operator=(Searcher&& other) = default;
+Searcher::~Searcher() = default;
+
+namespace
+{
+
+// Two archives are treated as the same archive when their UUIDs are equal.
+bool archivesAreEquivalent(const Archive& a1, const Archive& a2)
+{
+  return a1.getUuid() == a2.getUuid();
+}
+
+// True when `archives` already holds an archive equivalent to `newArchive`.
+bool contains(const std::vector<Archive>& archives, const Archive& newArchive)
+{
+  auto it = archives.begin();
+  while ( it != archives.end() && !archivesAreEquivalent(*it, newArchive) ) {
+    ++it;
+  }
+  return it != archives.end();
+}
+
+} // unnamed namespace
+
+// Adds `archive` to the set of archives to search in, unless an equivalent
+// archive is already present. Adding an archive drops the current internal
+// database so that it gets rebuilt on the next search.
+// Returns *this to allow chaining.
+Searcher& Searcher::addArchive(const Archive& archive) {
+  if ( contains(m_archives, archive) ) {
+    return *this;
+  }
+
+  m_archives.push_back(archive);
+  mp_internalDb.reset();
+  return *this;
+}
+
+// Creates a Search for `query`, building the internal database first if it
+// does not exist yet.
+// Throws std::runtime_error when none of the archives provides a full-text
+// index.
+Search Searcher::search(const Query& query)
+{
+  if (!mp_internalDb) {
+    initDatabase();
+  }
+
+  const bool searchable = mp_internalDb->hasDatabase();
+  if (!searchable) {
+    throw std::runtime_error("Cannot create Search without FT Xapian index");
+  }
+
+  return Search(mp_internalDb, query);
+}
+
+// Records the verbosity to use for the internal database. The flag is only
+// passed on when the database is built by initDatabase(); a database that
+// already exists is not updated by this call.
+void Searcher::setVerbose(bool verbose)
+{
+ m_verbose = verbose;
+}
+
+// (Re)creates the internal database from the current archive list and
+// verbosity setting.
+void Searcher::initDatabase()
+{
+ mp_internalDb = std::make_shared<InternalDataBase>(m_archives, m_verbose);
+}
+
+// Binds `query` to the given database. No xapian work happens here: the
+// Enquire object is created on first use by getEnquire().
+Search::Search(std::shared_ptr<InternalDataBase> p_internalDb, const Query& query)
+ : mp_internalDb(p_internalDb),
+ mp_enquire(nullptr),
+ m_query(query)
+{
+}
+
+// Move operations and destructor use the compiler-generated versions.
+Search::Search(Search&& s) = default;
+Search& Search::operator=(Search&& s) = default;
+Search::~Search() = default;
+
+// Creates a query for the text `query`.
+Query::Query(const std::string& query) :
+ m_query(query)
+{}
+
+// Replaces the text of the query. Returns *this to allow chaining.
+Query& Query::setQuery(const std::string& query) {
+ m_query = query;
+ return *this;
+}
+
+// Turns the query into a geographic one: stores the centre
+// (`latitude`, `longitude`) and the `distance` around it, and flags the
+// query as a geo query. Returns *this to allow chaining.
+Query& Query::setGeorange(float latitude, float longitude, float distance) {
+  m_geoquery = true;
+  m_distance = distance;
+  m_latitude = latitude;
+  m_longitude = longitude;
+  return *this;
+}
+
+// Returns xapian's estimate of the number of matches for the query, or 0
+// when the query cannot be parsed.
+int Search::getEstimatedMatches() const
+{
+  try {
+    // Use the cached Enquire in place rather than copying it.
+    auto& enquire = getEnquire();
+    // Force xapian to check at least 10 documents even if we ask for an empty mset.
+    // Else, the get_matches_estimated may be wrong and return 0 even if we have results.
+    auto mset = enquire.get_mset(0, 0, 10);
+    return mset.get_matches_estimated();
+  } catch(const Xapian::QueryParserError&) {
+    return 0;
+  }
+}
+
+// Returns at most `maxResults` results starting at rank `start`.
+// When the query cannot be parsed, an empty result set is returned instead
+// of propagating the parser error.
+const SearchResultSet Search::getResults(int start, int maxResults) const {
+  try {
+    // Use the cached Enquire in place rather than copying it.
+    auto& enquire = getEnquire();
+    auto mset = enquire.get_mset(start, maxResults);
+    return SearchResultSet(mp_internalDb, std::move(mset));
+  } catch(const Xapian::QueryParserError&) {
+    return SearchResultSet(mp_internalDb);
+  }
+}
+
+// Returns the Enquire object of this search, creating it on first call:
+// the query is parsed, optionally printed (verbose mode) and installed on a
+// new Enquire which is then cached in mp_enquire.
+// Exceptions thrown while parsing leave the cache untouched.
+Xapian::Enquire& Search::getEnquire() const
+{
+  if ( !mp_enquire ) {
+    std::unique_ptr<Xapian::Enquire> fresh(new Xapian::Enquire(mp_internalDb->m_database));
+
+    const auto xquery = mp_internalDb->parseQuery(m_query);
+    if (mp_internalDb->m_verbose) {
+      std::cout << "Parsed query '" << m_query.m_query << "' to " << xquery.get_description() << std::endl;
+    }
+    fresh->set_query(xquery);
+
+    mp_enquire = std::move(fresh);
+  }
+  return *mp_enquire;
+}
+
+
+// Wraps the match set `mset` computed on `p_internalDb`.
+// `mset` is a named rvalue reference and therefore an lvalue inside this
+// function: std::move is needed to actually move it into the shared copy
+// (it falls back to a copy if Xapian::MSet has no move constructor).
+SearchResultSet::SearchResultSet(std::shared_ptr<InternalDataBase> p_internalDb, Xapian::MSet&& mset) :
+  mp_internalDb(p_internalDb),
+  mp_mset(std::make_shared<Xapian::MSet>(std::move(mset)))
+{}
+
+// Creates an empty result set: no match set is attached (mp_mset is null),
+// so size() is 0 and begin()/end() are built from a null pointer.
+SearchResultSet::SearchResultSet(std::shared_ptr<InternalDataBase> p_internalDb) :
+ mp_internalDb(p_internalDb),
+ mp_mset(nullptr)
+{}
+
+// Number of results in the set; 0 when no match set is attached.
+int SearchResultSet::size() const
+{
+  if ( mp_mset ) {
+    return mp_mset->size();
+  }
+  return 0;
+}
+
+// Iterator on the first result. Built from a null pointer when no match
+// set is attached.
+SearchResultSet::iterator SearchResultSet::begin() const
+{
+  if ( mp_mset ) {
+    return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->begin());
+  }
+  return nullptr;
+}
+
+// Iterator past the last result. Built from a null pointer when no match
+// set is attached.
+SearchResultSet::iterator SearchResultSet::end() const
+{
+  if ( mp_mset ) {
+    return new SearchIterator::InternalData(mp_internalDb, mp_mset, mp_mset->end());
+  }
+  return nullptr;
+}
+
+} //namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2021 Manneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_INTERNAL_H
+#define ZIM_SEARCH_INTERNAL_H
+
+#include <xapian.h>
+
+#include <zim/entry.h>
+#include <zim/error.h>
+
+namespace zim {
+
+/**
+ * A class to encapsulate a xapian database and all the information we can gather from it.
+ */
+/**
+ * A class to encapsulate a xapian database and all the information we can gather from it.
+ */
+class InternalDataBase {
+  public: // methods
+    // Collects the full-text databases of `archives`.
+    InternalDataBase(const std::vector<Archive>& archives, bool verbose);
+
+    // Whether at least one xapian database is available.
+    bool hasDatabase() const;
+    // Whether the valuesmap has any entry.
+    bool hasValuesmap() const;
+    // Whether `valueName` is present in the valuesmap.
+    bool hasValue(const std::string& valueName) const;
+    // Slot number associated with `valueName` in the valuesmap.
+    int valueSlot(const std::string& valueName) const;
+
+    // Converts `query` into a xapian query.
+    Xapian::Query parseQuery(const Query& query);
+
+  public: // data
+    // The (main) database we will search on (wrapping other xapian databases).
+    Xapian::Database m_database;
+
+    // The real databases.
+    std::vector<Xapian::Database> m_xapianDatabases;
+
+    // The archives we are searching on.
+    std::vector<Archive> m_archives;
+
+    // The valuesmap associated with the database.
+    std::map<std::string, int> m_valuesmap;
+
+    // If the database is open for suggestion.
+    // True even if the database has no new suggestion format.
+    // The constructor does not set this member, so it is given a default
+    // here to never be left indeterminate.
+    bool m_suggestionMode = false;
+
+    // The query parser corresponding to the database.
+    Xapian::QueryParser m_queryParser;
+
+    // The stemmer used to parse queries
+    Xapian::Stem m_stemmer;
+
+    // Verbosity of operations.
+    bool m_verbose;
+};
+
+// State behind a SearchIterator: the position in a match set plus lazily
+// fetched, cached views (xapian document, zim entry) of the current result.
+struct SearchIterator::InternalData {
+  std::shared_ptr<InternalDataBase> mp_internalDb;
+  std::shared_ptr<Xapian::MSet> mp_mset;
+  Xapian::MSetIterator iterator;
+  // Cache of the document at `iterator`; only valid if document_fetched.
+  Xapian::Document _document;
+  bool document_fetched;
+  // Cache of the entry at `iterator`; null until get_entry() is called.
+  std::unique_ptr<Entry> _entry;
+
+  // Copies share the database and match set but own their cached entry.
+  InternalData(const InternalData& other) :
+    mp_internalDb(other.mp_internalDb),
+    mp_mset(other.mp_mset),
+    iterator(other.iterator),
+    _document(other._document),
+    document_fetched(other.document_fetched),
+    _entry(other._entry ? new Entry(*other._entry) : nullptr )
+  {
+  }
+
+  InternalData& operator=(const InternalData& other)
+  {
+    if (this == &other) {
+      return *this;
+    }
+    mp_internalDb = other.mp_internalDb;
+    mp_mset = other.mp_mset;
+    iterator = other.iterator;
+    _document = other._document;
+    document_fetched = other.document_fetched;
+    _entry.reset(other._entry ? new Entry(*other._entry) : nullptr);
+    return *this;
+  }
+
+  InternalData(std::shared_ptr<InternalDataBase> p_internalDb, std::shared_ptr<Xapian::MSet> p_mset, Xapian::MSetIterator it) :
+    mp_internalDb(p_internalDb),
+    mp_mset(p_mset),
+    iterator(it),
+    document_fetched(false)
+  {}
+
+  // Returns the xapian document of the current result, fetching it on the
+  // first call. Throws std::runtime_error on the end iterator.
+  Xapian::Document get_document() {
+    if ( document_fetched ) {
+      return _document;
+    }
+    if (iterator == mp_mset->end()) {
+      throw std::runtime_error("Cannot get entry for end iterator");
+    }
+    _document = iterator.get_document();
+    document_fetched = true;
+    return _document;
+  }
+
+  // Index, in mp_internalDb->m_archives, of the archive the current result
+  // comes from, derived from the document id.
+  int get_databasenumber() {
+    const Xapian::docid docid = *iterator;
+    return (docid - 1) % mp_internalDb->m_archives.size();
+  }
+
+  // Returns the zim entry of the current result, looking it up by the path
+  // stored as document data on the first call.
+  Entry& get_entry() {
+    if ( _entry ) {
+      return *_entry;
+    }
+    const int dbIndex = get_databasenumber();
+    auto archive = mp_internalDb->m_archives.at(dbIndex);
+    _entry.reset(new Entry(archive.getEntryByPath(get_document().get_data())));
+    return *_entry;
+  }
+
+  // Same database, same match set object and same position.
+  bool operator==(const InternalData& other) const {
+    const bool sameDb = (mp_internalDb == other.mp_internalDb);
+    return sameDb
+        && mp_mset == other.mp_mset
+        && iterator == other.iterator;
+  }
+};
+
+
+
+}; //namespace zim
+
+#endif //ZIM_SEARCH_INTERNAL_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+
+#include "xapian/myhtmlparse.h"
+#include <zim/search_iterator.h>
+#include <zim/search.h>
+#include <zim/archive.h>
+#include <zim/item.h>
+#include "search_internal.h"
+
+namespace zim {
+
+
+// Destructor and move operations use the compiler-generated versions.
+SearchIterator::~SearchIterator() = default;
+SearchIterator::SearchIterator(SearchIterator&& it) = default;
+SearchIterator& SearchIterator::operator=(SearchIterator&& it) = default;
+
+// A default-constructed iterator has no internal data.
+SearchIterator::SearchIterator() : SearchIterator(nullptr)
+{};
+
+// Stores `internal_data` (which may be null) in the `internal` member.
+SearchIterator::SearchIterator(InternalData* internal_data)
+ : internal(internal_data)
+{}
+
+// Copy constructor: duplicates the internal data of `it`, if it has any.
+SearchIterator::SearchIterator(const SearchIterator& it)
+  : internal(nullptr)
+{
+  if (!it.internal) {
+    return;
+  }
+  internal = std::unique_ptr<InternalData>(new InternalData(*it.internal));
+}
+
+// Copy assignment. Existing internal data is reused (assigned in place)
+// when both sides have some; otherwise it is created or dropped to mirror
+// `it`.
+SearchIterator & SearchIterator::operator=(const SearchIterator& it) {
+  if ( it.internal && internal ) {
+    *internal = *it.internal;
+  } else if ( it.internal ) {
+    internal = std::unique_ptr<InternalData>(new InternalData(*it.internal));
+  } else {
+    internal.reset();
+  }
+
+  return *this;
+}
+
+// Two iterators are equal when neither has internal data, or when both
+// have and their internal data compare equal.
+bool SearchIterator::operator==(const SearchIterator& it) const {
+  if ( internal && it.internal ) {
+    return (*internal == *it.internal);
+  }
+  // At least one side is empty: equal only if both are.
+  return !internal && !it.internal;
+}
+
+// Negation of operator==.
+bool SearchIterator::operator!=(const SearchIterator& it) const {
+ return ! (*this == it);
+}
+
+// Pre-increment: moves to the next result and invalidates the cached
+// document and entry. Does nothing on an iterator without internal data.
+SearchIterator& SearchIterator::operator++() {
+  if ( internal ) {
+    ++(internal->iterator);
+    internal->document_fetched = false;
+    internal->_entry.reset();
+  }
+  return *this;
+}
+
+// Post-increment: advances this iterator and returns a copy of its
+// previous state.
+SearchIterator SearchIterator::operator++(int) {
+ SearchIterator it = *this;
+ operator++();
+ return it;
+}
+
+// Pre-decrement: moves to the previous result and invalidates the cached
+// document and entry. Does nothing on an iterator without internal data.
+SearchIterator& SearchIterator::operator--() {
+  if ( internal ) {
+    --(internal->iterator);
+    internal->document_fetched = false;
+    internal->_entry.reset();
+  }
+  return *this;
+}
+
+// Post-decrement: steps this iterator back and returns a copy of its
+// previous state.
+SearchIterator SearchIterator::operator--(int) {
+ SearchIterator it = *this;
+ operator--();
+ return it;
+}
+
+// Returns the path of the current result, or "" for an iterator without
+// internal data.
+// The path comes from the data stored in the xapian document. When the
+// archive uses the new namespace scheme and the database stores full paths
+// (metadata "data" is "fullPath" or absent), the first two characters are
+// dropped so that only the path without namespace is returned.
+std::string SearchIterator::getPath() const {
+  if ( ! internal ) {
+    return "";
+  }
+
+  const std::string dbData = internal->get_document().get_data();
+  auto& db = *internal->mp_internalDb;
+  const bool newScheme = db.m_archives.at(getFileIndex()).hasNewNamespaceScheme();
+
+  std::string dataType = db.m_database.get_metadata("data");
+  if (dataType.empty()) {
+    dataType = "fullPath";
+  }
+
+  const bool stripNamespace = newScheme && dataType == "fullPath";
+  return stripNamespace ? dbData.substr(2) : dbData;
+}
+
+// Returns the raw data stored in the xapian document of the current
+// result, or "" for an iterator without internal data.
+std::string SearchIterator::getDbData() const {
+  if ( internal ) {
+    return internal->get_document().get_data();
+  }
+  return "";
+}
+
+// Returns the title of the zim entry of the current result, or "" for an
+// iterator without internal data.
+std::string SearchIterator::getTitle() const {
+  if ( internal ) {
+    return internal->get_entry().getTitle();
+  }
+  return "";
+}
+
+// Returns the percentage score xapian gives to the current result, or 0
+// for an iterator without internal data.
+int SearchIterator::getScore() const {
+  if ( internal ) {
+    return internal->iterator.get_percent();
+  }
+  return 0;
+}
+
+// Returns a text snippet for the current result.
+//  - "" for an iterator without internal data;
+//  - databases without valuesmap: the content of value slot 1 if not empty;
+//  - databases with a "snippet" value: that value;
+//  - otherwise a snippet is generated from the item content, and "" is
+//    returned if that fails.
+std::string SearchIterator::getSnippet() const {
+ if ( ! internal ) {
+ return "";
+ }
+
+ // Generate full text snippet
+ if ( ! internal->mp_internalDb->hasValuesmap() )
+ {
+ /* This is the old legacy version. Guess and try */
+ std::string stored_snippet = internal->get_document().get_value(1);
+ if ( ! stored_snippet.empty() )
+ return stored_snippet;
+ /* Let's continue here, and see if we can generate one */
+ }
+ else if ( internal->mp_internalDb->hasValue("snippet") )
+ {
+ return internal->get_document().get_value(internal->mp_internalDb->valueSlot("snippet"));
+ }
+ /* No stored snippet: build one from the content of the entry. */
+ try {
+ Entry& entry = internal->get_entry();
+ /* Get the content of the item to generate a snippet.
+ We parse it and use the html dump so that html tags are removed from
+ the content and the text can be cut nicely at any place. */
+ zim::MyHtmlParser htmlParser;
+ std::string content = entry.getItem().getData();
+ // Parsing errors are ignored: whatever was dumped so far is used.
+ try {
+ htmlParser.parse_html(content, "UTF-8", true);
+ } catch (...) {}
+ return internal->mp_mset->snippet(htmlParser.dump, 500, internal->mp_internalDb->m_stemmer);
+ } catch (...) {
+ return "";
+ }
+}
+
+// Returns the size recorded in the database for the current result, or -1
+// when it is not available (iterator without internal data, empty legacy
+// value, or no "size" value in the valuesmap).
+int SearchIterator::getSize() const {
+  if ( ! internal ) {
+    return -1;
+  }
+
+  const auto& db = *internal->mp_internalDb;
+  if ( ! db.hasValuesmap() ) {
+    /* This is the old legacy version. Guess and try */
+    const std::string legacyValue = internal->get_document().get_value(2);
+    return legacyValue.empty() ? -1 : atoi(legacyValue.c_str());
+  }
+
+  if ( db.hasValue("size") ) {
+    const std::string value = internal->get_document().get_value(db.valueSlot("size"));
+    return atoi(value.c_str());
+  }
+
+  /* The size is never used. Do we really want to get the content and
+     calculate the size ? */
+  return -1;
+}
+
+// Returns the word count recorded in the database for the current result,
+// or -1 when it is not available (iterator without internal data, empty
+// legacy value, or no "wordcount" value in the valuesmap).
+int SearchIterator::getWordCount() const {
+  if ( ! internal ) {
+    return -1;
+  }
+
+  const auto& db = *internal->mp_internalDb;
+  if ( ! db.hasValuesmap() ) {
+    /* This is the old legacy version. Guess and try */
+    const std::string legacyValue = internal->get_document().get_value(3);
+    return legacyValue.empty() ? -1 : atoi(legacyValue.c_str());
+  }
+
+  if ( db.hasValue("wordcount") ) {
+    const std::string value = internal->get_document().get_value(db.valueSlot("wordcount"));
+    return atoi(value.c_str());
+  }
+
+  return -1;
+}
+
+// Returns the index of the archive the current result belongs to, or 0
+// for an iterator without internal data.
+int SearchIterator::getFileIndex() const {
+  if ( ! internal ) {
+    return 0;
+  }
+  return internal->get_databasenumber();
+}
+
+// Returns the UUID of the archive the current result belongs to.
+// Throws std::runtime_error for an iterator without internal data.
+Uuid SearchIterator::getZimId() const {
+ if (! internal ) {
+ throw std::runtime_error("Cannot get zimId from uninitialized iterator");
+ }
+ return internal->mp_internalDb->m_archives.at(getFileIndex()).getUuid();
+}
+
+// Returns the zim entry of the current result.
+// Throws std::runtime_error for an iterator without internal data.
+SearchIterator::reference SearchIterator::operator*() const {
+ if (! internal ) {
+ throw std::runtime_error("Cannot get a entry for a uninitialized iterator");
+ }
+ return internal->get_entry();
+}
+
+// Address of the entry returned by operator*() (which may throw).
+SearchIterator::pointer SearchIterator::operator->() const {
+ return &**this;
+}
+
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+
+#include <zim/suggestion.h>
+#include <zim/item.h>
+#include "suggestion_internal.h"
+#include <iostream>
+#include "fileimpl.h"
+#include "tools.h"
+#include "constants.h"
+
+#if defined(ENABLE_XAPIAN)
+#include <unicode/locid.h>
+#endif // ENABLE_XAPIAN
+
+namespace zim
+{
+
+// Creates the suggestion database of `archive`. When xapian support is
+// compiled in, the title index of the archive is opened (if it has one).
+SuggestionDataBase::SuggestionDataBase(const Archive& archive, bool verbose)
+ : m_archive(archive),
+ m_verbose(verbose)
+{
+// Initialize Xapian DB if it is enabled
+#if defined(ENABLE_XAPIAN)
+ initXapianDb();
+#endif // ENABLE_XAPIAN
+}
+
+#if defined(ENABLE_XAPIAN)
+// Opens the xapian title index of the archive and configures the query
+// parser (default operator, stemmer). Returns early, leaving m_database
+// untouched, when the archive has no title index or when it cannot be
+// opened.
+void SuggestionDataBase::initXapianDb() {
+ // NOTE(review): the parser is given m_database before the opened database
+ // is assigned to it at the end of this function - confirm the parser does
+ // not need the final database.
+ m_queryParser.set_database(m_database);
+ m_queryParser.set_default_op(Xapian::Query::op::OP_AND);
+
+ auto impl = m_archive.getImpl();
+ FileImpl::FindxResult r;
+
+ r = impl->findx('X', "title/xapian");
+ if (!r.first) {
+ return;
+ }
+
+ auto xapianEntry = Entry(impl, entry_index_type(r.second));
+ auto accessInfo = xapianEntry.getItem().getDirectAccessInformation();
+ // NOTE(review): a zero second member presumably means that the item cannot
+ // be accessed directly - confirm.
+ if (accessInfo.second == 0) {
+ return;
+ }
+
+ Xapian::Database database;
+ if (!getDbFromAccessInfo(accessInfo, database)) {
+ return;
+ }
+
+ m_valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+ auto language = database.get_metadata("language");
+ if (language.empty() ) {
+ // Databases created before 2017/03 have no language metadata.
+ // However, terms were stemmed anyway and we need to stem our
+ // search query the same way the database was created.
+ // So we need a language; let's use the one of the zim file.
+ // If the zim file has no language metadata either, there is not
+ // much more we can do here.
+ try {
+ language = m_archive.getMetadata("Language");
+ } catch(...) {}
+ }
+ if (!language.empty()) {
+ icu::Locale languageLocale(language.c_str());
+ /* Configuring language based stemming */
+ try {
+ m_stemmer = Xapian::Stem(languageLocale.getLanguage());
+ m_queryParser.set_stemmer(m_stemmer);
+ } catch (...) {
+ std::cout << "No stemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
+ }
+ }
+
+ // Only now does hasDatabase() start returning true.
+ m_database = database;
+}
+
+// True once a database has been assigned to m_database.
+// NOTE(review): this looks at the `internal` member of Xapian::Database -
+// confirm it is meant to be used by client code.
+bool SuggestionDataBase::hasDatabase() const
+{
+ return !m_database.internal.empty();
+}
+
+// True when the stored valuesmap contains at least one entry.
+bool SuggestionDataBase::hasValuesmap() const
+{
+ return !m_valuesmap.empty();
+}
+
+// True when `valueName` is a key of the valuesmap.
+bool SuggestionDataBase::hasValue(const std::string& valueName) const
+{
+  const bool known = m_valuesmap.count(valueName) != 0;
+  return known;
+}
+
+// Returns the slot number mapped to `valueName` in the valuesmap.
+// Throws std::out_of_range (from std::map::at) when the name is unknown;
+// use hasValue() to check first.
+int SuggestionDataBase::valueSlot(const std::string& valueName) const
+{
+ return m_valuesmap.at(valueName);
+}
+
+/*
+ * Builds the xapian query used for title suggestions.
+ *
+ * subquery_phrase: selects documents that have the terms in the order of the query
+ * within a specified window.
+ * subquery_anchored: selects documents that have the terms in the order of the
+ * query within a specified window and starts from the beginning of the document.
+ * subquery_and: selects documents that have all the terms in the query (this
+ * is the query first stored in `xquery` below).
+ *
+ * subquery_phrase and subquery_anchored by themselves are quite exclusive. To
+ * include more "similar" docs, we combine them with subquery_and using OP_OR
+ * operator. If a particular document has a weight of A in subquery_and and B
+ * in subquery_phrase and C in subquery_anchored, the net weight of that document
+ * becomes A+B+C (normalised out of 100). So the documents closer to the query
+ * get a higher relevance.
+ *
+ * For an empty `query`, only the plain parsed query is returned.
+ */
+Xapian::Query SuggestionDataBase::parseQuery(const std::string& query)
+{
+ // The parser's stemming strategy is changed below, so calls are serialized.
+ std::lock_guard<std::mutex> locker(m_mutex);
+ Xapian::Query xquery;
+
+ const auto flags = Xapian::QueryParser::FLAG_DEFAULT | Xapian::QueryParser::FLAG_PARTIAL;
+
+ // Reset stemming strategy for normal parsing
+ m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_SOME);
+ xquery = m_queryParser.parse_query(query, flags);
+
+ if (!query.empty()) {
+ // Reconfigure stemming strategy for phrase search
+ m_queryParser.set_stemming_strategy(Xapian::QueryParser::STEM_NONE);
+
+ Xapian::Query subquery_phrase = m_queryParser.parse_query(query);
+ // Force the OP_PHRASE window to be equal to the number of terms.
+ subquery_phrase = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_phrase.get_terms_begin(), subquery_phrase.get_terms_end(), subquery_phrase.get_length());
+
+ // Same phrase query, with ANCHOR_TERM prepended to the query text.
+ auto qs = ANCHOR_TERM + query;
+ Xapian::Query subquery_anchored = m_queryParser.parse_query(qs);
+ subquery_anchored = Xapian::Query(Xapian::Query::OP_PHRASE, subquery_anchored.get_terms_begin(), subquery_anchored.get_terms_end(), subquery_anchored.get_length());
+
+ xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_phrase);
+ xquery = Xapian::Query(Xapian::Query::OP_OR, xquery, subquery_anchored);
+ }
+
+ return xquery;
+}
+
+#endif // ENABLE_XAPIAN
+
+// Creates a searcher over `archive`. No suggestion database is created here;
+// see initDatabase().
+SuggestionSearcher::SuggestionSearcher(const Archive& archive)
+  : mp_internalDb(nullptr)
+  , m_archive(archive)
+  , m_verbose(false)
+{
+}
+
+// Copy/move construction, copy/move assignment and destruction all use the
+// compiler-generated, member-wise implementations.
+SuggestionSearcher::SuggestionSearcher(const SuggestionSearcher& other) = default;
+SuggestionSearcher& SuggestionSearcher::operator=(const SuggestionSearcher& other) = default;
+SuggestionSearcher::SuggestionSearcher(SuggestionSearcher&& other) = default;
+SuggestionSearcher& SuggestionSearcher::operator=(SuggestionSearcher&& other) = default;
+SuggestionSearcher::~SuggestionSearcher() = default;
+
+// Returns a SuggestionSearch for `query`, sharing this searcher's database.
+SuggestionSearch SuggestionSearcher::suggest(const std::string& query)
+{
+  // The suggestion database is created lazily, on the first request.
+  if (mp_internalDb == nullptr) {
+    initDatabase();
+  }
+
+  return SuggestionSearch(mp_internalDb, query);
+}
+
+// Records the verbosity flag. The flag is handed to the SuggestionDataBase
+// when initDatabase() creates it.
+void SuggestionSearcher::setVerbose(bool verbose)
+{
+ m_verbose = verbose;
+}
+
+// (Re)creates the shared suggestion database from the archive and the current
+// verbosity flag.
+void SuggestionSearcher::initDatabase()
+{
+ mp_internalDb = std::make_shared<SuggestionDataBase>(m_archive, m_verbose);
+}
+
+// Binds a query string to a (shared) suggestion database. Nothing is parsed or
+// searched here; with xapian enabled the enquire object starts out null and is
+// built on demand by getEnquire().
+SuggestionSearch::SuggestionSearch(std::shared_ptr<SuggestionDataBase> p_internalDb, const std::string& query)
+ : mp_internalDb(p_internalDb),
+ m_query(query)
+#if defined(ENABLE_XAPIAN)
+ , mp_enquire(nullptr)
+#endif // ENABLE_XAPIAN
+{}
+
+// Move construction, move assignment and destruction use the
+// compiler-generated implementations.
+SuggestionSearch::SuggestionSearch(SuggestionSearch&& s) = default;
+SuggestionSearch& SuggestionSearch::operator=(SuggestionSearch&& s) = default;
+SuggestionSearch::~SuggestionSearch() = default;
+
+// Returns an estimation of the number of suggestions matching the query.
+int SuggestionSearch::getEstimatedMatches() const
+{
+#if defined(ENABLE_XAPIAN)
+  if (mp_internalDb->hasDatabase()) {
+    try {
+      auto enquire = getEnquire();
+      // An empty mset is requested, but xapian is forced to check at least 10
+      // documents: otherwise get_matches_estimated may be wrong and return 0
+      // even if we have results.
+      return enquire.get_mset(0, 0, 10).get_matches_estimated();
+    } catch(...) {
+      std::cerr << "Query Parsing failed, Switching to search without index." << std::endl;
+    }
+  }
+#endif // ENABLE_XAPIAN
+
+  // Fallback (no xapian database, or the xapian search threw): count the
+  // entries found by title in the archive.
+  const auto titleMatches = mp_internalDb->m_archive.findByTitle(m_query);
+  return titleMatches.size();
+}
+
+// Returns the suggestions in the window [start, start + maxResults).
+const SuggestionResultSet SuggestionSearch::getResults(int start, int maxResults) const {
+#if defined(ENABLE_XAPIAN)
+  const bool useXapian = mp_internalDb->hasDatabase();
+  if (useXapian) {
+    try {
+      auto enquire = getEnquire();
+      Xapian::MSet matches = enquire.get_mset(start, maxResults);
+      return SuggestionResultSet(mp_internalDb, std::move(matches));
+    } catch(...) {
+      std::cerr << "Query Parsing failed, Switching to search without index." << std::endl;
+    }
+  }
+#endif // ENABLE_XAPIAN
+
+  // Fallback (no xapian database, or the xapian search threw): title based
+  // range restricted to the requested window.
+  auto titleRange = mp_internalDb->m_archive.findByTitle(m_query);
+  titleRange.offset(start, maxResults);
+  return SuggestionResultSet(titleRange);
+}
+
+// With xapian enabled, closes the xapian database held by the shared
+// SuggestionDataBase; without xapian this is a no-op.
+// NOTE(review): presumably this makes later searches take the title-range
+// fallback path - confirm against hasDatabase().
+const void SuggestionSearch::forceRangeSuggestion() {
+#if defined(ENABLE_XAPIAN)
+ mp_internalDb->m_database.close();
+#endif // ENABLE_XAPIAN
+}
+
+#if defined(ENABLE_XAPIAN)
+// Returns the enquire object of this search, building and caching it on the
+// first call.
+Xapian::Enquire& SuggestionSearch::getEnquire() const
+{
+  if ( !mp_enquire ) {
+    std::unique_ptr<Xapian::Enquire> newEnquire(new Xapian::Enquire(mp_internalDb->m_database));
+
+    const auto parsedQuery = mp_internalDb->parseQuery(m_query);
+    if (mp_internalDb->m_verbose) {
+      std::cout << "Parsed query '" << m_query << "' to " << parsedQuery.get_description() << std::endl;
+    }
+    newEnquire->set_query(parsedQuery);
+
+    /*
+     * In suggestion mode, we are searching over a separate title index. Default BM25 is not
+     * adapted for this case. WDF factor(k1) controls the effect of within document frequency.
+     * k1 = 0.001 reduces the effect of word repetition in document. In BM25, smaller documents
+     * get larger weights, so normalising the length of documents is necessary using b = 1.
+     * The document set is first sorted by their relevance score then by value so that suggestion
+     * results are closer to search string.
+     * refer https://xapian.org/docs/apidoc/html/classXapian_1_1BM25Weight.html
+     */
+    newEnquire->set_weighting_scheme(Xapian::BM25Weight(0.001,0,1,1,0.5));
+
+    if (mp_internalDb->hasValue("title")) {
+      newEnquire->set_sort_by_relevance_then_value(mp_internalDb->valueSlot("title"), false);
+    }
+
+    if (mp_internalDb->hasValue("targetPath")) {
+      newEnquire->set_collapse_key(mp_internalDb->valueSlot("targetPath"));
+    }
+
+    // Only a fully configured object is cached.
+    mp_enquire = std::move(newEnquire);
+  }
+
+  return *mp_enquire;
+}
+
+// Builds a result set backed by a xapian match set. The entry range stays
+// null, which is how size()/begin()/end() recognise a xapian based set.
+SuggestionResultSet::SuggestionResultSet(std::shared_ptr<SuggestionDataBase> p_internalDb, Xapian::MSet&& mset) :
+  mp_internalDb(p_internalDb),
+  mp_entryRange(nullptr),
+  // `mset` is a named rvalue reference, i.e. an lvalue in this scope: it has
+  // to be moved explicitly, otherwise make_shared copy-constructs from it.
+  mp_mset(std::make_shared<Xapian::MSet>(std::move(mset)))
+{}
+#endif // ENABLE_XAPIAN
+
+// Builds a result set backed by a title based entry range (no database).
+SuggestionResultSet::SuggestionResultSet(EntryRange entryRange)
+  : mp_internalDb(nullptr)
+  , mp_entryRange(new EntryRange(entryRange))
+#if defined(ENABLE_XAPIAN)
+  , mp_mset(nullptr)
+#endif // ENABLE_XAPIAN
+{
+}
+
+// Number of suggestions held by this result set.
+int SuggestionResultSet::size() const
+{
+#if defined(ENABLE_XAPIAN)
+  // No entry range means the set is backed by a xapian mset.
+  const bool xapianBacked = (mp_entryRange == nullptr);
+  if (xapianBacked) {
+    return mp_mset->size();
+  }
+#endif // ENABLE_XAPIAN
+
+  return mp_entryRange->size();
+}
+
+// Iterator on the first suggestion of the set.
+SuggestionResultSet::iterator SuggestionResultSet::begin() const
+{
+#if defined(ENABLE_XAPIAN)
+  if ( mp_entryRange == nullptr ) {
+    // xapian backed set: the iterator is built from the internal data pointer.
+    auto* internalData = new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->begin());
+    return internalData;
+  }
+#endif // ENABLE_XAPIAN
+
+  return iterator(mp_entryRange->begin());
+}
+
+// Past-the-end iterator of the set.
+SuggestionResultSet::iterator SuggestionResultSet::end() const
+{
+#if defined(ENABLE_XAPIAN)
+  if ( mp_entryRange == nullptr ) {
+    // xapian backed set: the iterator is built from the internal data pointer.
+    auto* internalData = new iterator::SuggestionInternalData(mp_internalDb, mp_mset, mp_mset->end());
+    return internalData;
+  }
+#endif // ENABLE_XAPIAN
+
+  return iterator(mp_entryRange->end());
+}
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SUGGESTION_INTERNAL_H
+#define ZIM_SUGGESTION_INTERNAL_H
+
+#include "zim/suggestion.h"
+#include "zim/archive.h"
+
+#include <stdexcept>
+#include <mutex>
+
+#if defined(LIBZIM_WITH_XAPIAN)
+#include <xapian.h>
+#endif
+
+namespace zim
+{
+
+/**
+ * A class to encapsulate a xapian title index and its archive and all the
+ * information we can gather from it.
+ *
+ * The xapian related members only exist when LIBZIM_WITH_XAPIAN is defined.
+ * Most data members are public and accessed directly by the suggestion
+ * search/iterator code.
+ */
+class SuggestionDataBase {
+ public: // methods
+ SuggestionDataBase(const Archive& archive, bool verbose);
+
+ public: // data
+ // The archive to get suggestions from.
+ Archive m_archive;
+
+ // Verbosity of operations.
+ bool m_verbose;
+
+ private: // data
+ // NOTE(review): presumably guards m_queryParser while parseQuery()
+ // reconfigures it - confirm against the implementation.
+ std::mutex m_mutex;
+
+#if defined(LIBZIM_WITH_XAPIAN)
+
+ public: // xapian based methods
+ bool hasDatabase() const;
+ bool hasValuesmap() const;
+ bool hasValue(const std::string& valueName) const;
+ int valueSlot(const std::string& valueName) const;
+
+ // Builds the xapian query corresponding to a user query string.
+ Xapian::Query parseQuery(const std::string& query);
+
+ public: // xapian based data
+ // The Xapian database we will search on.
+ Xapian::Database m_database;
+
+ // The valuesmap associated with the database.
+ std::map<std::string, int> m_valuesmap;
+
+ // The query parser corresponding to the database.
+ Xapian::QueryParser m_queryParser;
+
+ // The stemmer used to parse queries
+ Xapian::Stem m_stemmer;
+
+ private:
+ void initXapianDb();
+#endif // LIBZIM_WITH_XAPIAN
+};
+
+#if defined(LIBZIM_WITH_XAPIAN)
+// State of a xapian backed SuggestionIterator: a position in a match set plus
+// lazily fetched caches (document and archive entry) for that position.
+struct SuggestionIterator::SuggestionInternalData {
+  std::shared_ptr<SuggestionDataBase> mp_internalDb;
+  std::shared_ptr<Xapian::MSet> mp_mset;
+  Xapian::MSetIterator iterator;
+  Xapian::Document _document;   // valid only when document_fetched is true
+  bool document_fetched;
+  std::unique_ptr<Entry> _entry; // null until get_entry() is called
+
+  // Starts at `position` in `p_mset`, with empty caches.
+  SuggestionInternalData(std::shared_ptr<SuggestionDataBase> p_internalDb, std::shared_ptr<Xapian::MSet> p_mset, Xapian::MSetIterator position)
+    : mp_internalDb(p_internalDb)
+    , mp_mset(p_mset)
+    , iterator(position)
+    , document_fetched(false)
+  {
+  }
+
+  // Deep copy: the cached entry (if any) is duplicated, the rest is shared/copied.
+  SuggestionInternalData(const SuggestionInternalData& other)
+    : mp_internalDb(other.mp_internalDb)
+    , mp_mset(other.mp_mset)
+    , iterator(other.iterator)
+    , _document(other._document)
+    , document_fetched(other.document_fetched)
+    , _entry(cloneEntry(other._entry))
+  {
+  }
+
+  SuggestionInternalData& operator=(const SuggestionInternalData& other)
+  {
+    if (this == &other) {
+      return *this;
+    }
+
+    mp_internalDb = other.mp_internalDb;
+    mp_mset = other.mp_mset;
+    iterator = other.iterator;
+    _document = other._document;
+    document_fetched = other.document_fetched;
+    _entry.reset(cloneEntry(other._entry));
+    return *this;
+  }
+
+  // Returns the document at the current position, fetching it on first use.
+  // Throws std::runtime_error on the end iterator.
+  Xapian::Document get_document() {
+    if ( document_fetched ) {
+      return _document;
+    }
+
+    if (iterator == mp_mset->end()) {
+      throw std::runtime_error("Cannot get entry for end iterator");
+    }
+    _document = iterator.get_document();
+    document_fetched = true;
+    return _document;
+  }
+
+  // Returns the archive entry whose path is the document data, looking it up
+  // on first use.
+  Entry& get_entry() {
+    if (_entry == nullptr) {
+      const auto path = get_document().get_data();
+      _entry.reset(new Entry(mp_internalDb->m_archive.getEntryByPath(path)));
+    }
+    return *_entry;
+  }
+
+  // Same database, same match set and same position; caches are ignored.
+  bool operator==(const SuggestionInternalData& other) const {
+    if (mp_internalDb != other.mp_internalDb) {
+      return false;
+    }
+    if (mp_mset != other.mp_mset) {
+      return false;
+    }
+    return iterator == other.iterator;
+  }
+
+  // Heap copy of `source`'s entry, or nullptr when there is none.
+  static Entry* cloneEntry(const std::unique_ptr<Entry>& source) {
+    return source ? new Entry(*source) : nullptr;
+  }
+};
+#endif // LIBZIM_WITH_XAPIAN
+
+}
+
+#endif // ZIM_SUGGESTION_INTERNAL_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+
+#include "zim/suggestion_iterator.h"
+#include "suggestion_internal.h"
+#include <stdexcept>
+
+namespace zim
+{
+
+// Defined here rather than in the public header, after suggestion_internal.h
+// has been included. Destruction and moves use the compiler-generated
+// implementations.
+SuggestionIterator::~SuggestionIterator() = default;
+SuggestionIterator::SuggestionIterator(SuggestionIterator&& it) = default;
+SuggestionIterator& SuggestionIterator::operator=(SuggestionIterator&& it) = default;
+
+// Builds a title-range backed iterator; it owns a private copy of `rangeIterator`.
+SuggestionIterator::SuggestionIterator(RangeIterator rangeIterator)
+  : mp_rangeIterator(new RangeIterator(rangeIterator))
+#if defined(LIBZIM_WITH_XAPIAN)
+  , mp_internal(nullptr)
+#endif // LIBZIM_WITH_XAPIAN
+{
+}
+
+#if defined(LIBZIM_WITH_XAPIAN)
+// Builds a xapian backed iterator. `internal` is stored in mp_internal; the
+// range iterator stays null.
+SuggestionIterator::SuggestionIterator(SuggestionInternalData* internal)
+ : mp_rangeIterator(nullptr),
+ mp_internal(internal)
+{}
+#endif // LIBZIM_WITH_XAPIAN
+
+// Copy constructor: duplicates whichever backend state `it` holds. The cached
+// suggestion item is not copied; it is rebuilt on the next dereference.
+SuggestionIterator::SuggestionIterator(const SuggestionIterator& it)
+  : mp_rangeIterator(nullptr)
+{
+#if defined(LIBZIM_WITH_XAPIAN)
+  if (it.mp_internal) {
+    mp_internal.reset(new SuggestionInternalData(*it.mp_internal));
+  } else {
+    mp_internal.reset(nullptr);
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  if (it.mp_rangeIterator) {
+    mp_rangeIterator.reset(new RangeIterator(*it.mp_rangeIterator));
+  }
+}
+
+// Copy assignment: duplicates whichever backend state `it` holds and drops the
+// cached suggestion item (it is rebuilt on the next dereference).
+SuggestionIterator& SuggestionIterator::operator=(const SuggestionIterator& it) {
+  // Self-assignment must be a no-op: the members are reset before being
+  // copied from `it`, so without this guard `x = x` would wipe the iterator.
+  if (this == &it) {
+    return *this;
+  }
+
+  mp_rangeIterator.reset();
+  if (it.mp_rangeIterator) {
+    mp_rangeIterator.reset(new RangeIterator(*it.mp_rangeIterator));
+  }
+
+#if defined(LIBZIM_WITH_XAPIAN)
+  mp_internal.reset();
+  if (it.mp_internal) {
+    mp_internal.reset(new SuggestionInternalData(*it.mp_internal));
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  m_suggestionItem.reset();
+  return *this;
+}
+
+// Two iterators are equal only when they use the same kind of backend and the
+// backend states compare equal; anything else compares unequal.
+bool SuggestionIterator::operator==(const SuggestionIterator& it) const {
+  const bool bothRangeBased = mp_rangeIterator && it.mp_rangeIterator;
+  if (bothRangeBased) {
+    return *mp_rangeIterator == *it.mp_rangeIterator;
+  }
+
+#if defined(LIBZIM_WITH_XAPIAN)
+  const bool bothXapianBased = mp_internal && it.mp_internal;
+  if (bothXapianBased) {
+    return *mp_internal == *it.mp_internal;
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  return false;
+}
+
+// Exact negation of operator==.
+bool SuggestionIterator::operator!=(const SuggestionIterator& it) const {
+  const bool same = (*this == it);
+  return !same;
+}
+
+// Pre-increment: advances the active backend and invalidates every cache tied
+// to the previous position.
+SuggestionIterator& SuggestionIterator::operator++() {
+#if defined(LIBZIM_WITH_XAPIAN)
+  if (mp_internal) {
+    auto& internal = *mp_internal;
+    ++internal.iterator;
+    internal._entry.reset();
+    internal.document_fetched = false;
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  if (mp_rangeIterator) {
+    ++(*mp_rangeIterator);
+  }
+
+  m_suggestionItem.reset();
+  return *this;
+}
+
+// Post-increment: returns a copy of the iterator as it was before advancing.
+SuggestionIterator SuggestionIterator::operator++(int) {
+  SuggestionIterator previous = *this;
+  ++(*this);
+  return previous;
+}
+
+// Pre-decrement: steps the active backend back and invalidates every cache
+// tied to the previous position.
+SuggestionIterator& SuggestionIterator::operator--() {
+#if defined(LIBZIM_WITH_XAPIAN)
+  if (mp_internal) {
+    auto& internal = *mp_internal;
+    --internal.iterator;
+    internal._entry.reset();
+    internal.document_fetched = false;
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  if (mp_rangeIterator) {
+    --(*mp_rangeIterator);
+  }
+
+  m_suggestionItem.reset();
+  return *this;
+}
+
+// Post-decrement: returns a copy of the iterator as it was before stepping back.
+SuggestionIterator SuggestionIterator::operator--(int) {
+  SuggestionIterator previous = *this;
+  --(*this);
+  return previous;
+}
+
+// Returns the archive entry the iterator points to.
+// Throws std::runtime_error when the iterator has no backend at all.
+Entry SuggestionIterator::getEntry() const {
+#if defined(LIBZIM_WITH_XAPIAN)
+  // The xapian backend takes precedence when present.
+  if (mp_internal) {
+    return mp_internal->get_entry();
+  }
+#endif // LIBZIM_WITH_XAPIAN
+
+  if (!mp_rangeIterator) {
+    throw std::runtime_error("Cannot dereference iterator");
+  }
+  return **mp_rangeIterator;
+}
+
+#if defined(LIBZIM_WITH_XAPIAN)
+// Raw data stored in the xapian document at the current position, or an empty
+// string for an iterator without xapian backend.
+std::string SuggestionIterator::getDbData() const {
+  if (mp_internal) {
+    return mp_internal->get_document().get_data();
+  }
+  return "";
+}
+
+// Path of the current suggestion as recorded in the index, or an empty string
+// for an iterator without xapian backend.
+std::string SuggestionIterator::getIndexPath() const
+{
+  if (! mp_internal) {
+    return "";
+  }
+
+  const auto& db = *mp_internal->mp_internalDb;
+  std::string path = mp_internal->get_document().get_data();
+  const bool hasNewNamespaceScheme = db.m_archive.hasNewNamespaceScheme();
+
+  // Kind of data stored in the documents; an index without the "data"
+  // metadata is treated as storing full paths.
+  std::string dbDataType = db.m_database.get_metadata("data");
+  if (dbDataType.empty()) {
+    dbDataType = "fullPath";
+  }
+
+  // If the archive has new namespace scheme and the type of its indexed data
+  // is `fullPath` we return only the `path` without namespace
+  const bool stripNamespace = hasNewNamespaceScheme && dbDataType == "fullPath";
+  return stripNamespace ? path.substr(2) : path;
+}
+
+// Title of the entry behind the current suggestion. Returns an empty string
+// for an iterator without xapian backend, or when the entry lookup throws.
+std::string SuggestionIterator::getIndexTitle() const {
+  std::string title;
+  if (mp_internal) {
+    try {
+      title = mp_internal->get_entry().getTitle();
+    } catch (...) {
+      // Best effort: an unreachable entry yields an empty title.
+      title.clear();
+    }
+  }
+  return title;
+}
+
+// Snippet generated by the match set from the suggestion title (length
+// argument 500, database stemmer). Returns an empty string for an iterator
+// without xapian backend, or when the snippet generation throws.
+std::string SuggestionIterator::getIndexSnippet() const {
+  std::string snippet;
+  if (mp_internal) {
+    try {
+      snippet = mp_internal->mp_mset->snippet(getIndexTitle(), 500, mp_internal->mp_internalDb->m_stemmer);
+    } catch(...) {
+      // Best effort: no snippet rather than an error.
+      snippet.clear();
+    }
+  }
+  return snippet;
+}
+#endif // LIBZIM_WITH_XAPIAN
+
+// Dereference: returns the SuggestionItem for the current position, building
+// and caching it on first use (the cache is dropped whenever the iterator
+// moves or is assigned).
+// Throws std::runtime_error when the iterator has no backend to build from.
+const SuggestionItem& SuggestionIterator::operator*() {
+ if (m_suggestionItem) {
+ return *m_suggestionItem;
+ }
+
+#if defined(LIBZIM_WITH_XAPIAN)
+ // xapian backend: title, path and snippet come from the index.
+ if (mp_internal) {
+ m_suggestionItem.reset(new SuggestionItem(getIndexTitle(),
+ getIndexPath(), getIndexSnippet()));
+ } else
+#endif // LIBZIM_WITH_XAPIAN
+
+ // Careful: with xapian enabled, this `if` is the body of the `else` above;
+ // without xapian it is a plain statement. Keep the two chained.
+ if (mp_rangeIterator) {
+ m_suggestionItem.reset(new SuggestionItem((*mp_rangeIterator)->getTitle(),
+ (*mp_rangeIterator)->getPath()));
+ }
+
+ if (!m_suggestionItem){
+ throw std::runtime_error("Cannot dereference iterator");
+ }
+
+ return *m_suggestionItem.get();
+}
+
+// Member access: same item (and same exception) as operator*.
+const SuggestionItem* SuggestionIterator::operator->() {
+  const SuggestionItem& item = operator*();
+  return &item;
+}
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "template.h"
+
+namespace zim
+{
+  // Character-driven state machine recognising two constructs in a text:
+  //   <%token%>        -> Event::onToken("token")
+  //   <%/N/title%>     -> Event::onLink('N', "title")
+  // Text preceding a construct is reported through Event::onData. Every
+  // character is appended to `data`; `save` remembers where the pending
+  // construct starts so that the text before it can be cut out.
+
+  // Plain text; a '<' may open a construct.
+  void TemplateParser::state_data(char ch)
+  {
+    data += ch;
+
+    if (ch == '<')
+    {
+      state = &TemplateParser::state_lt;
+      save = data.size() - 1;  // index of the '<'
+    }
+  }
+
+  // Just after '<': only '%' continues the construct.
+  void TemplateParser::state_lt(char ch)
+  {
+    data += ch;
+
+    state = (ch == '%') ? &TemplateParser::state_token0
+                        : &TemplateParser::state_data;
+  }
+
+  // Just after "<%": '/' starts a link, anything else starts a token.
+  void TemplateParser::state_token0(char ch)
+  {
+    data += ch;
+
+    if (ch == '/')
+    {
+      state = &TemplateParser::state_link0;
+      return;
+    }
+
+    token = data.size() - 1;  // index of the first token character
+    state = &TemplateParser::state_token;
+  }
+
+  // Inside a token, waiting for the closing '%'.
+  void TemplateParser::state_token(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+      state = &TemplateParser::state_token_end;
+  }
+
+  // After the '%' closing a token: '>' completes it.
+  void TemplateParser::state_token_end(char ch)
+  {
+    if (ch != '>')
+    {
+      data += ch;
+      state = &TemplateParser::state_data;
+      return;
+    }
+
+    if (event)
+    {
+      event->onData(data.substr(0, save));
+      // The '>' was not appended, so `data` ends with the closing '%',
+      // which is excluded from the token.
+      event->onToken(data.substr(token, data.size() - token - 1));
+    }
+
+    // Cleared whether or not a handler is set (as state_title_end does), so
+    // the buffer cannot grow without bound when there is no handler.
+    data.clear();
+    state = &TemplateParser::state_data;
+  }
+
+  // Just after "<%/": this character is the namespace.
+  void TemplateParser::state_link0(char ch)
+  {
+    data += ch;
+
+    ns = ch;
+    state = &TemplateParser::state_link;
+  }
+
+  // After the namespace: a '/' must follow, then the title starts.
+  void TemplateParser::state_link(char ch)
+  {
+    data += ch;
+
+    if (ch != '/')
+    {
+      state = &TemplateParser::state_data;
+      return;
+    }
+
+    token = data.size();  // index of the first title character (not yet appended)
+    state = &TemplateParser::state_title;
+  }
+
+  // Inside a link title, waiting for the closing '%'.
+  void TemplateParser::state_title(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+    {
+      token_e = data.size() - 1;  // index of the closing '%'
+      state = &TemplateParser::state_title_end;
+    }
+  }
+
+  // After the '%' closing a title: the link completes on the next '>'.
+  void TemplateParser::state_title_end(char ch)
+  {
+    data += ch;
+
+    if (ch != '>')
+      return;
+
+    if (event)
+    {
+      event->onData(data.substr(0, save));
+      event->onLink(ns, data.substr(token, token_e - token));
+    }
+
+    data.clear();
+    state = &TemplateParser::state_data;
+  }
+
+  // Reports whatever is buffered as plain data and resets the parser.
+  void TemplateParser::flush()
+  {
+    if (event)
+      event->onData(data);
+    data.clear();
+    state = &TemplateParser::state_data;
+  }
+}
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_TEMPLATE_H
+#define ZIM_TEMPLATE_H
+
+#include <string>
+
+namespace zim
+{
+ // Incremental parser for templates containing <%token%> and <%/N/title%>
+ // constructs. Characters are fed through parse(); recognised pieces are
+ // reported to the Event handler given at construction.
+ class TemplateParser
+ {
+   public:
+     // Callbacks invoked by the parser. The parser does not own the handler.
+     class Event
+     {
+       public:
+         // Plain text found before a construct (or flushed at the end).
+         virtual void onData(const std::string& data) = 0;
+         // A <%token%> construct.
+         virtual void onToken(const std::string& token) = 0;
+         // A <%/N/title%> construct: `ns` is the single namespace character.
+         virtual void onLink(char ns, const std::string& url) = 0;
+         virtual ~Event() = default;
+     };
+
+   private:
+     Event* event;                      // may be null: nothing is reported then
+
+     std::string data;                  // characters buffered since the last report
+     std::string::size_type save;       // index in `data` of the pending '<'
+     std::string::size_type token;      // index in `data` where the token/title starts
+     std::string::size_type token_e;    // index in `data` where the title ends
+     char ns;                           // namespace character of the pending link
+     typedef void (TemplateParser::*state_type)(char);
+
+     state_type state;                  // handler for the next character
+
+     void state_data(char ch);
+     void state_lt(char ch);
+     void state_token0(char ch);
+     void state_token(char ch);
+     void state_token_end(char ch);
+     void state_link0(char ch);
+     void state_link(char ch);
+     void state_title(char ch);
+     void state_title_end(char ch);
+
+   public:
+     // Every member is given a defined value, so that a freshly constructed
+     // parser can be copied without reading indeterminate values.
+     explicit TemplateParser(Event* ev)
+       : event(ev),
+         save(0),
+         token(0),
+         token_e(0),
+         ns('\0'),
+         state(&TemplateParser::state_data)
+       { }
+
+     // Feeds one character to the state machine.
+     void parse(char ch)
+     {
+       (this->*state)(ch);
+     }
+
+     // Feeds every character of `s`, in order.
+     void parse(const std::string& s)
+     {
+       for (const char ch : s)
+         parse(ch);
+     }
+
+     // Reports the buffered text as data and returns to the initial state.
+     void flush();
+ };
+}
+
+#endif // ZIM_TEMPLATE_H
--- /dev/null
+/*
+ * Copyright (C) 2016-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneeshs P M <manu.pm55@gmail.com>
+ * Copyright (C) 2013-2016 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "tools.h"
+#include "fs.h"
+
+#include <sys/types.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <memory>
+#include <mutex>
+#include <stdexcept>
+#include <random>
+#include <errno.h>
+
+#ifdef _WIN32
+# include <windows.h>
+# include <direct.h>
+# include <io.h>
+# include <stringapiset.h>
+# define SEPARATOR "\\"
+#else
+# include <unistd.h>
+# define SEPARATOR "/"
+#endif
+
+#ifdef __MINGW32__
+# include <time.h>
+#else
+# include <thread>
+# include <chrono>
+#endif
+
+// Tells whether content of the given mimetype is considered compressible:
+// anything starting with "text", any "+xml" / "+json" type, and the plain
+// javascript / json application types.
+bool zim::isCompressibleMimetype(const std::string& mimetype)
+{
+  if (mimetype.compare(0, 4, "text") == 0) {
+    return true;
+  }
+
+  if (mimetype.find("+xml") != std::string::npos || mimetype.find("+json") != std::string::npos) {
+    return true;
+  }
+
+  return mimetype == "application/javascript" || mimetype == "application/json";
+}
+
+// Counts the whitespace separated words of `text`.
+uint32_t zim::countWords(const std::string& text)
+{
+  // std::isspace has undefined behaviour for negative values other than EOF,
+  // which plain `char` yields for non-ASCII (e.g. UTF-8) bytes: go through
+  // unsigned char.
+  const auto isSpace = [](char c) {
+    return std::isspace(static_cast<unsigned char>(c)) != 0;
+  };
+
+  uint32_t numWords = 0;
+  const std::string::size_type length = text.size();
+  std::string::size_type i = 0;
+
+  // Find first word
+  while ( i < length && isSpace(text[i]) ) i++;
+
+  while ( i < length ) {
+    // Find end of word
+    while ( i < length && !isSpace(text[i]) ) i++;
+    numWords++;
+    // Find start of next word
+    while ( i < length && isSpace(text[i]) ) i++;
+  }
+  return numWords;
+}
+
+
+// Suspends the calling thread for (at least) `microseconds` microseconds.
+void zim::microsleep(int microseconds) {
+#ifdef __MINGW32__
+  struct timespec wait = {0, 0};
+  // Whole seconds go to tv_sec; the remaining microseconds (always less than
+  // one second) are converted to nanoseconds.
+  wait.tv_sec = microseconds / 1000000;
+  wait.tv_nsec = (microseconds % 1000000) * 1000;
+  nanosleep(&wait, nullptr);
+#else
+  std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
+#endif
+}
+
+
+// Splits a long path ("N/path" or "/N/path") into its namespace character and
+// the path inside that namespace.
+// Throws std::runtime_error when the namespace is missing or is not a single
+// character followed by '/' (or by the end of the string).
+std::tuple<char, std::string> zim::parseLongPath(const std::string& longPath)
+{
+  const std::string::size_type length = longPath.size();
+
+  /* Index of the namespace char; discard '/' from absolute paths */
+  const unsigned int nsPos = (longPath[0] == '/') ? 1 : 0;
+
+  // No character left for the namespace.
+  if (nsPos + 1 > length)
+    throw std::runtime_error("Cannot parse path");
+
+  // Empty namespace ("//...").
+  if (longPath[nsPos] == '/')
+    throw std::runtime_error("Cannot parse path");
+
+  // Namespace longer than one character.
+  if (nsPos + 1 < length && longPath[nsPos + 1] != '/')
+    throw std::runtime_error("Cannot parse path");
+
+  const char ns = longPath[nsPos];
+  const unsigned int pathStart = std::min<unsigned int>(nsPos + 2, (unsigned int)longPath.size());
+
+  return std::make_tuple(ns, longPath.substr(pathStart));
+}
+
+// Extracts the size from an illustration path of the exact form
+// "Illustration_<width>x<height>@1", where width and height must be equal.
+// Throws std::runtime_error (naming the offending path) for anything else.
+unsigned int zim::parseIllustrationPathToSize(const std::string& s)
+{
+  int nw(0), nh(0), nEnd(0);
+  long int w(-1), h(-1);
+
+  // %n records how many characters were consumed so far: nw/nh locate the
+  // start of each number, nEnd is only set once "@1" has been matched.
+  const int converted = sscanf(s.c_str(), "Illustration_%n%ldx%n%ld@1%n", &nw, &w, &nh, &h, &nEnd);
+
+  const bool wellFormed =
+       converted == 2
+    && (size_t)nEnd == s.size()                  // nothing after "@1"
+    // %ld skips leading whitespace; reject it explicitly
+    && !isspace((unsigned char)s[nw])
+    && !isspace((unsigned char)s[nh])
+    && w == h
+    && w >= 0
+    && (long int)(unsigned int)w == w;           // must fit the return type
+
+  if (!wellFormed) {
+    throw std::runtime_error("Cannot parse illustration path: " + s);
+  }
+  return (unsigned int)w;
+}
+
+// Returns a random number uniformly distributed over [0, max] (both bounds
+// included). Safe to call from several threads: the shared engine is guarded
+// by a mutex.
+uint32_t zim::randomNumber(uint32_t max)
+{
+  // Engine shared by all callers, seeded once from the wall clock.
+  static std::default_random_engine random(
+    std::chrono::system_clock::now().time_since_epoch().count());
+  static std::mutex mutex;
+
+  // Scaling the raw engine output by max/engine.max() would produce `max`
+  // only when the engine yields its own maximum, so the upper bound would be
+  // (almost) never returned. The distribution covers the whole range evenly.
+  std::uniform_int_distribution<uint32_t> distribution(0, max);
+
+  std::lock_guard<std::mutex> l(mutex);
+  return distribution(random);
+}
+
+/* Split a string into the tokens separated by any of the `delims` characters.
+ * Runs of delimiters count as one separator: no empty token is produced. */
+std::vector<std::string> zim::split(const std::string & str,
+                                    const std::string & delims)
+{
+  std::vector<std::string> tokens;
+
+  // `tokenStart` is the first character of the current token, `tokenEnd` the
+  // delimiter following it (npos when the token runs to the end).
+  std::string::size_type tokenStart = str.find_first_not_of(delims, 0);
+  while (tokenStart != std::string::npos)
+  {
+    const std::string::size_type tokenEnd = str.find_first_of(delims, tokenStart);
+    tokens.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
+    tokenStart = str.find_first_not_of(delims, tokenEnd);
+  }
+
+  return tokens;
+}
+
+// Parses a values map serialised as "name:slot;name:slot;..." into a
+// name -> slot map. Entries without both a name and a slot are ignored; when
+// a name appears several times, the first occurrence is kept.
+std::map<std::string, int> zim::read_valuesmap(const std::string &s) {
+  std::map<std::string, int> result;
+
+  for (const std::string& elem : split(s, ";"))
+  {
+    const std::vector<std::string> tmp_elems = split(elem, ":");
+    // split() drops empty tokens, so "name" or "name:" yields fewer than two
+    // elements: skip such entries instead of indexing past the end.
+    if (tmp_elems.size() < 2) {
+      continue;
+    }
+    result.insert( std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())) );
+  }
+  return result;
+}
+
+// Xapian based tools
+#if defined(ENABLE_XAPIAN)
+
+#include "xapian.h"
+
+#include <unicode/translit.h>
+#include <unicode/ucnv.h>
+// Returns `text` run through the ICU transliteration
+// "Lower; NFD; [:M:] remove; NFC", i.e. lower-cased with marks removed.
+std::string zim::removeAccents(const std::string& text)
+{
+  // The UnicodeString constructor below relies on the default converter.
+  ucnv_setDefaultName("UTF-8");
+
+  // The transliterator is created once, on the first call.
+  // NOTE(review): `icuStatus` is never checked; if createInstance fails the
+  // pointer is presumably null and the call below would crash - confirm.
+  static UErrorCode icuStatus = U_ZERO_ERROR;
+  static std::unique_ptr<icu::Transliterator> accentRemover(icu::Transliterator::createInstance(
+    "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, icuStatus));
+
+  icu::UnicodeString buffer(text.c_str());
+  accentRemover->transliterate(buffer);
+
+  std::string result;
+  buffer.toUTF8String(result);
+  return result;
+}
+
+// Opens the xapian database embedded in a file.
+// `accessInfo.first` is the file to open and `accessInfo.second` the offset
+// to seek to before handing the descriptor to xapian. On success `database`
+// is assigned and true is returned; on any failure a diagnostic is written to
+// std::cerr and false is returned.
+bool zim::getDbFromAccessInfo(zim::Item::DirectAccessInfo accessInfo, Xapian::Database& database) {
+  const auto& filePath = accessInfo.first;
+  const auto& dbOffset = accessInfo.second;
+
+  zim::DEFAULTFS::FD databasefd;
+  try {
+    databasefd = zim::DEFAULTFS::openFile(filePath);
+  } catch (...) {
+    std::cerr << "Impossible to open " << filePath << std::endl;
+    std::cerr << strerror(errno) << std::endl;
+    return false;
+  }
+
+  const bool positioned = databasefd.seek(zim::offset_t(dbOffset));
+  if (!positioned) {
+    std::cerr << "Something went wrong seeking databasedb "
+              << filePath << std::endl;
+    std::cerr << "dbOffest = " << dbOffset << std::endl;
+    return false;
+  }
+
+  try {
+    // The descriptor is released to xapian.
+    database = Xapian::Database(databasefd.release());
+  } catch( Xapian::DatabaseError& e) {
+    std::cerr << "Something went wrong opening xapian database for zimfile "
+              << filePath << std::endl;
+    std::cerr << "dbOffest = " << dbOffset << std::endl;
+    std::cerr << "error = " << e.get_msg() << std::endl;
+    return false;
+  }
+
+  return true;
+}
+#endif
--- /dev/null
+/*
+ * Copyright (C) 2016-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2013-2016 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_TOOLS_H
+#define OPENZIM_LIBZIM_TOOLS_H
+
+#include <string>
+#include <tuple>
+#include <map>
+#include <vector>
+#include "config.h"
+
+#include <zim/item.h>
+
+#if defined(ENABLE_XAPIAN)
+namespace Xapian {
+ class Database;
+}
+#endif // ENABLE_XAPIAN
+namespace zim {
+ // True for mimetypes starting with "text", containing "+xml" or "+json",
+ // or equal to "application/javascript" / "application/json".
+ bool isCompressibleMimetype(const std::string& mimetype);
+ // Number of whitespace separated words in `text`.
+ uint32_t countWords(const std::string& text);
+ // Suspend the calling thread for the given number of microseconds.
+ void microsleep(int microseconds);
+
+ // Split "N/path" (or "/N/path") into the namespace char and the path.
+ // Throws std::runtime_error if the path cannot be parsed.
+ std::tuple<char, std::string> parseLongPath(const std::string& longPath);
+
+ // Parse an illustration path ("Illustration_<width>x<height>@1") to a size.
+ // Width and height must be equal; throws std::runtime_error otherwise.
+ unsigned int parseIllustrationPathToSize(const std::string& s);
+
+ /** Return a random number from range [0, max]
+ *
+ * This function is threadsafe
+ **/
+ uint32_t randomNumber(uint32_t max);
+
+ // Split `str` on any character of `delims`; empty tokens are not returned.
+ std::vector<std::string> split(const std::string & str,
+ const std::string & delims=" *-");
+
+ // Parse a "name:slot;name:slot;..." string into a name -> slot map.
+ std::map<std::string, int> read_valuesmap(const std::string& s);
+
+// Xapian based tools
+#if defined(ENABLE_XAPIAN)
+ // Lower-case `text` and strip its marks using an ICU transliterator.
+ std::string removeAccents(const std::string& text);
+ // Open the xapian database located at (file, offset) `accessInfo`.
+ // Returns false (after logging to std::cerr) on failure.
+ bool getDbFromAccessInfo(Item::DirectAccessInfo accessInfo, Xapian::Database& database);
+#endif
+}
+
+#endif // OPENZIM_LIBZIM_TOOLS_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2018-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/uuid.h>
+#include <iostream>
+#include <sstream>
+#include <time.h>
+#include <zim/zim.h> // necessary to have the new types
+#include "log.h"
+#include "md5.h"
+
+#ifdef _WIN32
+
+# include <time.h>
+# include <windows.h>
+// Minimal Windows replacement for gettimeofday(), based on timeGetTime().
+// The reported time is therefore the millisecond tick count returned by
+// timeGetTime(), not the wall-clock time. `tzp` is ignored. Always returns 0.
+int gettimeofday(struct timeval* tp, void* tzp) {
+  const DWORD milliseconds = timeGetTime();
+  tp->tv_sec = milliseconds / 1000;
+  // tv_usec is expressed in microseconds: convert the millisecond remainder.
+  tp->tv_usec = (milliseconds % 1000) * 1000;
+  return 0;
+}
+
+#define getpid GetCurrentProcessId
+
+#else
+# include <sys/time.h>
+#endif
+
+log_define("zim.uuid")
+
+namespace zim
+{
+ namespace
+ {
+   // Hexadecimal digits, indexed by nibble value.
+   char hex[] = "0123456789abcdef";
+
+   // Hex digit of the high nibble of `v`.
+   inline char hi(char v)
+   {
+     const int nibble = (v >> 4) & 0xf;
+     return hex[nibble];
+   }
+
+   // Hex digit of the low nibble of `v`.
+   inline char lo(char v)
+   {
+     const int nibble = v & 0xf;
+     return hex[nibble];
+   }
+ }
+
+ // Generates a uuid as the MD5 digest of `value`. When `value` is empty, the
+ // digest is computed over the current clock() value and the current time
+ // instead, so successive calls give different uuids.
+ Uuid Uuid::generate(std::string value)
+ {
+   Uuid ret;
+   struct zim_MD5_CTX md5ctx;
+   zim_MD5Init(&md5ctx);
+
+   if ( value.empty() ) {
+     struct timeval tv;
+     gettimeofday(&tv, 0);
+
+     clock_t c = clock();
+
+     zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&c), sizeof(clock_t));
+     zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&tv), sizeof(struct timeval));
+   } else {
+     zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(value.data()), value.size());
+   }
+   // The digest is written straight into the uuid bytes.
+   zim_MD5Final(reinterpret_cast<uint8_t*>(&ret.data[0]), &md5ctx);
+
+   // Log the formatted uuid: `ret.data` holds raw digest bytes with no
+   // terminator, so it must not be streamed as a C string.
+   // NOTE(review): assumes Uuid::data is a plain char array - confirm in zim/uuid.h.
+   log_debug("generated uuid: " << ret);
+
+   return ret;
+ }
+
+ // Textual form of the uuid, exactly as produced by the stream operator
+ // (which is called by its qualified name).
+ Uuid::operator std::string() const
+ {
+ std::ostringstream out;
+ zim::operator<<(out, *this);
+ return out.str();
+ }
+
+ // Writes the 16 uuid bytes as lower-case hexadecimal, grouped 4-2-2-2-6
+ // bytes with '-' between the groups.
+ std::ostream& operator<< (std::ostream& out, const Uuid& uuid)
+ {
+   for (unsigned n = 0; n < 16; ++n)
+   {
+     // A dash precedes the bytes opening the 2nd to 5th groups.
+     const bool groupStart = (n == 4 || n == 6 || n == 8 || n == 10);
+     if (groupStart)
+       out << '-';
+     out << hi(uuid.data[n]) << lo(uuid.data[n]);
+   }
+   return out;
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <iostream>
+#include <sstream>
+
+#include <zim/version.h>
+#include <zim/zim_config.h>
+#include <config.h>
+#include <zstd.h>
+#include <lzma.h>
+
+#if defined(ENABLE_XAPIAN)
+#include <xapian.h>
+#include <unicode/uversion.h>
+#endif
+
+namespace zim
+{
+ // Collect (name, version) pairs for libzim and the libraries it is built
+ // with: libzstd and liblzma always, libxapian and libicu only when
+ // ENABLE_XAPIAN is defined.
+ LibVersions getVersions() {
+   LibVersions versions;
+   versions.push_back({ "libzim", LIBZIM_VERSION });
+   versions.push_back({ "libzstd", ZSTD_VERSION_STRING });
+   versions.push_back({ "liblzma", LZMA_VERSION_STRING });
+
+#if defined(ENABLE_XAPIAN)
+   // Libxapian is not a mandatory dependency
+   versions.push_back({ "libxapian", XAPIAN_VERSION });
+
+   // U_ICU_VERSION does not include the patch level if 0, so the version
+   // string is assembled from its three numeric components.
+   std::ostringstream icuVersion;
+   icuVersion << U_ICU_VERSION_MAJOR_NUM;
+   icuVersion << "." << U_ICU_VERSION_MINOR_NUM;
+   icuVersion << "." << U_ICU_VERSION_PATCHLEVEL_NUM;
+   versions.push_back({ "libicu", icuVersion.str() });
+#endif
+
+   return versions;
+ }
+
+// Print one "<name> <version>" line per entry of getVersions() to `out`.
+// Every line except the first is prefixed with "+ ".
+void printVersions(std::ostream& out) {
+  const LibVersions versions = getVersions();
+  // Track the position explicitly: comparing each entry with
+  // versions.front() by value would also drop the prefix of any later entry
+  // that happens to equal the first one.
+  bool first = true;
+  for (const auto& entry : versions) {
+    out << (first ? "" : "+ ") <<
+      entry.first << " " << entry.second << std::endl;
+    first = false;
+  }
+}
+
+} //namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_DIRENT_H
+#define ZIM_WRITER_DIRENT_H
+
+#include "cluster.h"
+#include "tinyString.h"
+
+#include "debug.h"
+
+namespace zim
+{
+ namespace writer {
+ class Dirent;
+
+ // Namespace of a dirent.
+ // Keep the enumerators sorted in "alphabetical" order: compareUrl() and
+ // compareTitle() order dirents by comparing these values with `<`.
+ // Dirent stores the value in a 2-bit field, so it must stay within 0..3.
+ enum class NS: uint8_t {
+ C = 0,
+ M = 1,
+ W = 2,
+ X = 3
+ };
+
+ char NsAsChar(NS ns);
+
+ // Tagged union holding the target information of a Dirent.
+ // Exactly one of the three alternatives (Direct, Redirect, Resolved) is
+ // alive at a time; `tag` records which one. The constructors start the
+ // matching alternative and the destructor ends it explicitly.
+ class DirentInfo {
+ public: // structures
+ // Content location: a cluster pointer and a blob number.
+ // Default-constructed with no cluster and blob number 0.
+ struct Direct {
+ Direct() :
+ cluster(nullptr),
+ blobNumber(0)
+ {};
+ Cluster* cluster;
+ blob_index_t blobNumber;
+ } PACKED;
+
+ // Redirection described by the namespace and path of its target.
+ struct Redirect {
+ Redirect(NS ns, const std::string& target) :
+ targetPath(target),
+ ns(ns)
+ {};
+ Redirect(Redirect&& r) = default;
+ ~Redirect() {};
+ TinyString targetPath;
+ NS ns;
+ } PACKED;
+
+ // Redirection bound to the Dirent it points to.
+ struct Resolved {
+ Resolved(const Dirent* target) :
+ targetDirent(target)
+ {};
+ const Dirent* targetDirent;
+ } PACKED;
+
+ public: // functions
+ // Union members are not destroyed automatically: run the destructor of
+ // the alternative selected by `tag`.
+ ~DirentInfo() {
+ switch(tag) {
+ case DIRECT:
+ direct.~Direct();
+ break;
+ case REDIRECT:
+ redirect.~Redirect();
+ break;
+ case RESOLVED:
+ resolved.~Resolved();
+ break;
+ }
+ };
+ // Each constructor moves its argument into the matching union member and
+ // sets `tag` accordingly.
+ DirentInfo(Direct&& d):
+ direct(std::move(d)),
+ tag(DirentInfo::DIRECT)
+ {}
+ DirentInfo(Redirect&& r):
+ redirect(std::move(r)),
+ tag(DirentInfo::REDIRECT)
+ {}
+ DirentInfo(Resolved&& r):
+ resolved(std::move(r)),
+ tag(DirentInfo::RESOLVED)
+ {}
+ // Accessors: each one ASSERTs that `tag` selects the requested alternative
+ // before returning a reference to it.
+ DirentInfo::Direct& getDirect() {
+ ASSERT(tag, ==, DIRECT);
+ return direct;
+ }
+ DirentInfo::Redirect& getRedirect() {
+ ASSERT(tag, ==, REDIRECT);
+ return redirect;
+ }
+ DirentInfo::Resolved& getResolved() {
+ ASSERT(tag, ==, RESOLVED);
+ return resolved;
+ }
+ const DirentInfo::Direct& getDirect() const {
+ ASSERT(tag, ==, DIRECT);
+ return direct;
+ }
+ const DirentInfo::Redirect& getRedirect() const {
+ ASSERT(tag, ==, REDIRECT);
+ return redirect;
+ }
+ const DirentInfo::Resolved& getResolved() const {
+ ASSERT(tag, ==, RESOLVED);
+ return resolved;
+ }
+
+ private: // members
+ // Storage shared by the three alternatives.
+ union {
+ Direct direct;
+ Redirect redirect;
+ Resolved resolved;
+ } PACKED;
+
+ public: // members
+ // Discriminator telling which union member is currently alive.
+ enum : char {DIRECT, REDIRECT, RESOLVED} tag;
+ } PACKED;
+
+ // Directory entry used while writing: namespace, path and title, mime type,
+ // entry index, target information (DirentInfo), an offset and two flags.
+ // The class is declared PACKED, so member order and types affect its size.
+ class Dirent
+ {
+ // Mime type value identifying a redirect (see isRedirect()).
+ static const uint16_t redirectMimeType = 0xffff;
+ static const uint32_t version = 0;
+
+ PathTitleTinyString pathTitle;
+ uint16_t mimeType;
+ entry_index_t idx = entry_index_t(0);
+ DirentInfo info;
+ offset_t offset;
+ // NS value; two bits are enough for the four enumerators.
+ uint8_t _ns : 2;
+ bool removed : 1;
+ bool frontArticle : 1;
+
+ public:
+ // Creator for a "classic" dirent
+ Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype);
+
+ // Creator for a "redirection" dirent
+ Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath);
+
+ // Creator for "temporary" dirent, used to search for dirent in container.
+ // We use them in url ordered container so we only need to set the namespace and the path.
+ // Other values are irrelevant.
+ Dirent(NS ns, const std::string& path)
+ : Dirent(ns, path, "", 0)
+ { }
+
+ NS getNamespace() const { return static_cast<NS>(_ns); }
+ // getTitle() and getRealTitle() differ only in the flag passed to
+ // pathTitle.getTitle() (false / true); its meaning is defined by
+ // PathTitleTinyString, which is not visible here.
+ std::string getTitle() const { return pathTitle.getTitle(false); }
+ std::string getRealTitle() const { return pathTitle.getTitle(true); }
+ std::string getPath() const { return pathTitle.getPath(); }
+
+ uint32_t getVersion() const { return version; }
+
+ NS getRedirectNs() const;
+ std::string getRedirectPath() const;
+ // Replace the REDIRECT information by a RESOLVED one pointing to `target`.
+ // `info` is destroyed and rebuilt in place because DirentInfo offers no
+ // assignment; ASSERTs that the dirent currently holds a REDIRECT.
+ void setRedirect(const Dirent* target) {
+ ASSERT(info.tag, ==, DirentInfo::REDIRECT);
+ info.~DirentInfo();
+ new(&info) DirentInfo(DirentInfo::Resolved(target));
+ }
+ // Index of the target dirent; only valid once setRedirect() has been
+ // called (getResolved() ASSERTs the RESOLVED tag).
+ entry_index_t getRedirectIndex() const {
+ return info.getResolved().targetDirent->getIdx();
+ }
+
+ void setIdx(entry_index_t idx_) { idx = idx_; }
+ entry_index_t getIdx() const { return idx; }
+
+
+ // Attach the dirent to `_cluster`; the blob number is the cluster's
+ // current count(). ASSERTs (via getDirect()) that the dirent is DIRECT.
+ void setCluster(zim::writer::Cluster* _cluster)
+ {
+ auto& direct = info.getDirect();
+ direct.cluster = _cluster;
+ direct.blobNumber = _cluster->count();
+ }
+
+ zim::writer::Cluster* getCluster()
+ {
+ return info.getDirect().cluster;
+ }
+
+ // Index of the attached cluster, or 0 when no cluster has been set.
+ cluster_index_t getClusterNumber() const {
+ auto& direct = info.getDirect();
+ return direct.cluster ? direct.cluster->getClusterIndex() : cluster_index_t(0);
+ }
+ blob_index_t getBlobNumber() const {
+ return info.getDirect().blobNumber;
+ }
+
+ bool isRedirect() const { return mimeType == redirectMimeType; }
+ bool isItem() const { return !isRedirect(); }
+ uint16_t getMimeType() const { return mimeType; }
+ // Only allowed on DIRECT dirents (ASSERTed).
+ void setMimeType(uint16_t m) {
+ ASSERT(info.tag, ==, DirentInfo::DIRECT);
+ mimeType = m;
+ }
+ // 12 bytes for a redirect or 16 for an item, plus pathTitle.size() + 1.
+ // NOTE(review): presumably the serialized size of the dirent (fixed header
+ // plus strings) - confirm against write().
+ size_t getDirentSize() const
+ {
+ return (isRedirect() ? 12 : 16) + pathTitle.size() + 1;
+ }
+
+ offset_t getOffset() const { return offset; }
+ void setOffset(offset_t o) { offset = o; }
+
+ bool isRemoved() const { return removed; }
+ void markRemoved() { removed = true; }
+
+ bool isFrontArticle() const { return frontArticle; }
+ void setFrontArticle() { frontArticle = true; }
+
+ void write(int out_fd) const;
+
+ friend bool compareUrl(const Dirent* d1, const Dirent* d2);
+ friend inline bool compareTitle(const Dirent* d1, const Dirent* d2);
+ } PACKED;
+
+
+ // Strict ordering of dirents by namespace first, then by path.
+ // Returns true when d1 sorts before d2.
+ inline bool compareUrl(const Dirent* d1, const Dirent* d2)
+ {
+   const NS lhsNs = d1->getNamespace();
+   const NS rhsNs = d2->getNamespace();
+   if (lhsNs != rhsNs) {
+     return lhsNs < rhsNs;
+   }
+   return d1->getPath() < d2->getPath();
+ }
+ // Strict ordering of dirents by namespace first, then by title (as
+ // returned by getTitle()). Returns true when d1 sorts before d2.
+ inline bool compareTitle(const Dirent* d1, const Dirent* d2)
+ {
+   const NS lhsNs = d1->getNamespace();
+   const NS rhsNs = d2->getNamespace();
+   if (lhsNs != rhsNs) {
+     return lhsNs < rhsNs;
+   }
+   return d1->getTitle() < d2->getTitle();
+ }
+ }
+}
+
+#endif // ZIM_WRITER_DIRENT_H
+
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Veloman Yunkan
+ * Copyright (C) 2020 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "cluster.h"
+#include "../log.h"
+#include "../endian_tools.h"
+#include "../debug.h"
+#include "../compression.h"
+
+#include <zim/writer/contentProvider.h>
+
+#include <sstream>
+#include <fstream>
+
+#include <fcntl.h>
+#include <stdexcept>
+
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) ::write((fd), (addr), (size))
+#endif
+
+// Largest number of bytes handed to a single _write() call by Cluster::write
+// (4 GiB - 1); bigger blobs are written in several chunks.
+const zim::size_type MAX_WRITE_SIZE(4UL*1024*1024*1024-1);
+
+namespace zim {
+namespace writer {
+
+// Create an empty, non-extended cluster using the given compression.
+Cluster::Cluster(Compression compression)
+  : compression(compression),
+    isExtended(false),
+    _size(0)
+{
+  // The offset list always starts with the offset of the first blob: 0.
+  const offset_t firstBlobStart(0);
+  blobOffsets.push_back(firstBlobStart);
+}
+
+// Release the buffer referenced by compressed_data, if any.
+Cluster::~Cluster() {
+  // delete[] on a null pointer is a no-op, so no explicit check is needed.
+  delete[] compressed_data.data();
+}
+
+// Drop everything the cluster holds: the blob offsets and content providers
+// as well as the compressed buffer.
+void Cluster::clear_data() {
+ clear_raw_data();
+ clear_compressed_data();
+}
+
+// Empty the blob offsets and the content providers. Swapping with fresh
+// containers (instead of clear()) also gives the old storage away.
+void Cluster::clear_raw_data() {
+  {
+    Offsets discardedOffsets;
+    discardedOffsets.swap(blobOffsets);
+  }
+  {
+    ClusterProviders discardedProviders;
+    discardedProviders.swap(m_providers);
+  }
+}
+
+// Free the compressed buffer, if there is one, and reset compressed_data to
+// an empty Blob.
+void Cluster::clear_compressed_data() {
+  const char* buffer = compressed_data.data();
+  if (!buffer) {
+    return;
+  }
+  delete[] buffer;
+  compressed_data = Blob();
+}
+
+// Mark the cluster as closed. A compressed cluster is compressed into
+// compressed_data first and its raw data (offsets, providers) is dropped.
+void Cluster::close() {
+  const bool mustCompress = (getCompression() != Compression::None);
+  if (mustCompress) {
+    // We must compress the content in a buffer.
+    compress();
+    clear_raw_data();
+  }
+  closed = true;
+}
+
+// Tell whether close() has completed on this cluster.
+bool Cluster::isClosed() const
+{
+  return closed.load();
+}
+
+// Size of the offset list plus the accumulated content size.
+// Offsets count for 8 bytes each in an extended cluster, 4 bytes otherwise.
+// Throws std::runtime_error when called on a closed cluster.
+zsize_t Cluster::size() const
+{
+  if (isClosed()) {
+    throw std::runtime_error("oups");
+  }
+  const auto offsetWidth = isExtended ? sizeof(uint64_t) : sizeof(uint32_t);
+  return zsize_t(blobOffsets.size() * offsetWidth) + _size;
+}
+
+// Emit the offset list through `writer`, one little-endian OFFSET_TYPE value
+// per entry. Each stored offset is shifted by the byte size of the offset
+// list itself before being written.
+template<typename OFFSET_TYPE>
+void Cluster::write_offsets(writer_t writer) const
+{
+  const size_type tableSize = blobOffsets.size() * sizeof(OFFSET_TYPE);
+  char encoded[sizeof(OFFSET_TYPE)];
+  for (const auto& blobOffset : blobOffsets)
+  {
+    const auto shifted = blobOffset.v + tableSize;
+    toLittleEndian(static_cast<OFFSET_TYPE>(shifted), encoded);
+    writer(Blob(encoded, sizeof(OFFSET_TYPE)));
+  }
+}
+
+// Emit the uncompressed cluster content through `writer`: the offset list
+// (32-bit entries, or 64-bit ones for an extended cluster) followed by the
+// blob data.
+void Cluster::write_content(writer_t writer) const
+{
+  if (!isExtended) {
+    write_offsets<uint32_t>(writer);
+  } else {
+    write_offsets<uint64_t>(writer);
+  }
+  write_data(writer);
+}
+
+// Compress the cluster content with the algorithm selected by
+// getCompression(). Throws std::runtime_error for any value other than
+// Lzma or Zstd.
+void Cluster::compress()
+{
+  const auto method = getCompression();
+  if (method == Compression::Lzma) {
+    _compress<LZMA_INFO>();
+  } else if (method == Compression::Zstd) {
+    _compress<ZSTD_INFO>();
+  } else {
+    throw std::runtime_error("We cannot compress an uncompressed cluster");
+  }
+}
+
+// Run the cluster content through a Compressor<COMP_TYPE> and keep the
+// result in compressed_data. The buffer released by the compressor is
+// referenced by compressed_data from then on; ~Cluster() and
+// clear_compressed_data() delete[] it.
+template<typename COMP_TYPE>
+void Cluster::_compress()
+{
+  Compressor<COMP_TYPE> compressor;
+  bool initialised = false;
+  // The compressor is initialised with the first piece of data it receives.
+  auto feedCompressor = [&](const Blob& data) -> void {
+    if (!initialised) {
+      compressor.init((char*)data.data());
+      initialised = true;
+    }
+    compressor.feed(data.data(), data.size());
+  };
+  write_content(feedCompressor);
+
+  zsize_t compressedSize;
+  auto buffer = compressor.get_data(&compressedSize);
+  compressed_data = Blob(buffer.release(), compressedSize.v);
+}
+
+namespace {
+
+// Write `size` bytes starting at `src` to `fd`.
+// The request is split into chunks of at most MAX_WRITE_SIZE bytes because a
+// single write call cannot transfer more than 4 GiB, and a partial write is
+// resumed where it stopped.
+// Throws std::runtime_error when _write() reports an error or no progress.
+void writeAll(int fd, const char* src, zim::size_type size)
+{
+  while (size) {
+    const zim::size_type chunk_size = std::min(MAX_WRITE_SIZE, size);
+    const auto written = _write(fd, src, chunk_size);
+    if (written <= 0) {
+      // Without this check an error (-1) would move `src` backwards and
+      // grow `size`, so the loop would never end.
+      throw std::runtime_error("Error writing");
+    }
+    src += written;
+    size -= written;
+  }
+}
+
+} // unnamed namespace
+
+// Write the cluster to the file descriptor `out_fd`: one information byte
+// (compression value, plus 0x10 for an extended cluster) followed by either
+// the raw content (Compression::None) or the compressed buffer.
+// Throws std::runtime_error on a write error or an unknown compression.
+void Cluster::write(int out_fd) const
+{
+  // write clusterInfo
+  char clusterInfo = 0;
+  if (isExtended) {
+    clusterInfo = 0x10;
+  }
+  clusterInfo += static_cast<uint8_t>(getCompression());
+  writeAll(out_fd, &clusterInfo, 1);
+
+  switch(getCompression())
+  {
+    case Compression::None:
+    {
+      // The data can be pretty big (> 4 GiB), especially with tests, so each
+      // blob goes through the chunking helper.
+      auto writer = [out_fd](const Blob& data) -> void {
+        writeAll(out_fd, data.data(), data.size());
+      };
+      write_content(writer);
+      break;
+    }
+
+    case Compression::Lzma:
+    case Compression::Zstd:
+    {
+      // The content has been compressed by close(); write the buffer as is.
+      log_debug("compress data");
+      writeAll(out_fd, compressed_data.data(), compressed_data.size());
+      break;
+    }
+
+    default:
+    {
+      std::ostringstream msg;
+      // Cast to an integer type: a uint8_t would be streamed as a character.
+      msg << "invalid compression flag " << static_cast<unsigned>(getCompression());
+      log_error(msg.str());
+      throw std::runtime_error(msg.str());
+    }
+  }
+}
+
+
+// Append a blob whose content is supplied by `provider`.
+// The blob is always recorded in the offset list and counted; the provider
+// itself is kept only when it reports a non-zero size. The cluster becomes
+// extended once the accumulated size no longer fits in 32 bits.
+void Cluster::addContent(std::unique_ptr<ContentProvider> provider)
+{
+  const auto blobSize = provider->getSize();
+  _size += blobSize;
+  blobOffsets.push_back(offset_t(_size.v));
+  ++m_count;
+  if (_size.v > UINT32_MAX) {
+    isExtended = true;
+  }
+
+  if (blobSize != 0) {
+    m_providers.push_back(std::move(provider));
+  }
+}
+
+// Append a blob whose content is the given string, wrapped in a
+// StringProvider.
+void Cluster::addContent(const std::string& data)
+{
+  std::unique_ptr<ContentProvider> stringProvider(new StringProvider(data));
+  addContent(std::move(stringProvider));
+}
+
+// Emit the content of every stored provider through `writer`.
+// Each provider is drained by calling feed() until it returns an empty blob.
+// ASSERTs that a provider is never empty and that the number of bytes it
+// fed equals the size it announced.
+void Cluster::write_data(writer_t writer) const
+{
+  for (const auto& provider : m_providers)
+  {
+    ASSERT(provider->getSize(), !=, 0U);
+    zim::size_type size = 0;
+    for (auto blob = provider->feed(); blob.size() != 0; blob = provider->feed()) {
+      size += blob.size();
+      writer(blob);
+    }
+    ASSERT(size, ==, provider->getSize());
+  }
+}
+
+} // writer
+} // zim
--- /dev/null
+/*
+ * Copyright (C) 2017-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CLUSTER_H_
+#define ZIM_WRITER_CLUSTER_H_
+
+#include <zim/zim.h>
+#include <zim/blob.h>
+#include <iostream>
+#include <vector>
+#include <functional>
+#include <atomic>
+
+#include <zim/writer/item.h>
+#include "../zim_types.h"
+#include "../debug.h"
+
+namespace zim {
+
+namespace writer {
+
+using writer_t = std::function<void(const Blob& data)>;
+class ContentProvider;
+
+// A cluster under construction: an ordered list of blobs, supplied by
+// content providers, that is eventually written to a file descriptor either
+// raw or compressed.
+class Cluster {
+ typedef std::vector<offset_t> Offsets;
+ typedef std::vector<std::unique_ptr<ContentProvider>> ClusterProviders;
+
+
+ public:
+ Cluster(Compression compression);
+ virtual ~Cluster();
+
+ void setCompression(Compression c) { compression = c; }
+ Compression getCompression() const { return compression; }
+
+ // Append one blob, from a provider or from a string.
+ void addContent(std::unique_ptr<ContentProvider> provider);
+ void addContent(const std::string& data);
+
+ // Number of blobs added so far.
+ blob_index_t count() const { return blob_index_t(m_count); }
+ // Size of the offset list plus the content; throws once the cluster is closed.
+ zsize_t size() const;
+ offset_t getOffset() const { return offset; }
+ void setOffset(offset_t o) { offset = o; }
+ bool is_extended() const { return isExtended; }
+ void clear_data();
+ // Compress the content if needed, then mark the cluster as closed.
+ void close();
+ bool isClosed() const;
+
+ void setClusterIndex(cluster_index_t idx) { index = idx; }
+ cluster_index_t getClusterIndex() const { return index; }
+
+ // Size of blob n: difference between two consecutive entries of blobOffsets.
+ zsize_t getBlobSize(blob_index_t n) const
+ { return zsize_t(blobOffsets[blob_index_type(n)+1].v - blobOffsets[blob_index_type(n)].v); }
+
+ // Entry n of blobOffsets (start of blob n relative to the start of the data).
+ offset_t getBlobOffset(blob_index_t n) const { return blobOffsets[n.v]; }
+ // 1 byte plus (count() + 1) offsets of 8 bytes (extended) or 4 bytes each.
+ // NOTE(review): the leading byte presumably is the cluster info byte
+ // emitted by write() - confirm. ASSERTs that the cluster is closed.
+ offset_t getDataOffset() const {
+ ASSERT(bool(closed), ==, true);
+ return offset_t(1) + offset_t((count().v + 1) * (isExtended?sizeof(uint64_t):sizeof(uint32_t)));
+ }
+
+ // Write the cluster to the file descriptor out_fd.
+ void write(int out_fd) const;
+
+ protected:
+ Compression compression;
+ cluster_index_t index;
+ // True once the accumulated content size exceeds UINT32_MAX.
+ bool isExtended;
+ // End offset of each blob, preceded by a leading 0.
+ Offsets blobOffsets;
+ offset_t offset;
+ // Accumulated size of the blob contents.
+ zsize_t _size;
+ // Providers of the non-empty blobs, in insertion order.
+ ClusterProviders m_providers;
+ // Compressed content; its buffer is released with delete[] by
+ // ~Cluster() and clear_compressed_data().
+ mutable Blob compressed_data;
+ // NOTE(review): not referenced by the Cluster code visible here.
+ std::string tmp_filename;
+ // Set by close().
+ std::atomic<bool> closed { false };
+ // Number of blobs added, including empty ones.
+ blob_index_type m_count { 0 };
+
+ private:
+ void write_content(writer_t writer) const;
+ template<typename OFFSET_TYPE>
+ void write_offsets(writer_t writer) const;
+ void write_data(writer_t writer) const;
+ void compress();
+ template<typename COMP_INFO>
+ void _compress();
+ void clear_raw_data();
+ void clear_compressed_data();
+};
+
+};
+
+};
+
+
+#endif //ZIM_WRITER_CLUSTER_H_
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "clusterWorker.h"
+
+#include "cluster.h"
+
+// Definition of the static counter declared in ClusterTask; it starts at 0.
+std::atomic<unsigned long> zim::writer::ClusterTask::waiting_task(0);
+
+namespace zim
+{
+ namespace writer
+ {
+
+ // Execute the task: close the cluster it was created for.
+ // `data` is not used.
+ void ClusterTask::run(CreatorData* data)
+ {
+   cluster->close();
+ }
+
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_CLUSTER_WORKER_H
+#define OPENZIM_LIBZIM_CLUSTER_WORKER_H
+
+#include <atomic>
+#include "workers.h"
+
+namespace zim {
+namespace writer {
+
+class Cluster;
+
+// Task that closes one cluster when run.
+// The static counter `waiting_task` holds the number of ClusterTask objects
+// currently alive: it is incremented on construction and decremented on
+// destruction.
+class ClusterTask : public Task {
+  public:
+    explicit ClusterTask(Cluster* cluster)
+      : cluster(cluster)
+    {
+      waiting_task.fetch_add(1);
+    }
+
+    virtual ~ClusterTask()
+    {
+      waiting_task.fetch_sub(1);
+    }
+
+    // Not copyable, so the live-object count stays exact.
+    ClusterTask(const ClusterTask&) = delete;
+    ClusterTask& operator=(const ClusterTask&) = delete;
+
+    virtual void run(CreatorData* data);
+
+    static std::atomic<unsigned long> waiting_task;
+
+  private:
+    // Cluster to close; not owned by the task.
+    Cluster* cluster;
+};
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_CLUSTER_WORKER_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/writer/contentProvider.h>
+
+#include "../fs.h"
+
+// Size (1 MiB) of the buffer FileProvider reads into; also the largest blob
+// returned by one FileProvider::feed() call.
+const zim::size_type BUFFER_SIZE(1024*1024);
+
+namespace zim
+{
+ namespace writer
+ {
+ // Return the whole string on the first call and an empty blob on every
+ // later call.
+ Blob StringProvider::feed()
+ {
+   if (!feeded) {
+     feeded = true;
+     return Blob(content.data(), content.size());
+   }
+   return Blob(nullptr, 0);
+ }
+
+ // Return the whole shared string on the first call and an empty blob on
+ // every later call.
+ Blob SharedStringProvider::feed()
+ {
+   if (!feeded) {
+     feeded = true;
+     return Blob(content->data(), content->size());
+   }
+   return Blob(nullptr, 0);
+ }
+
+ // Open `filepath` for reading, allocate a BUFFER_SIZE read buffer and
+ // record the file size as the amount of content to provide.
+ // Errors raised by DEFAULTFS::openFile are not handled here.
+ FileProvider::FileProvider(const std::string& filepath)
+ : filepath(filepath),
+ buffer(new char[BUFFER_SIZE]),
+ fd(new DEFAULTFS::FD(DEFAULTFS::openFile(filepath))),
+ offset(0)
+ {
+ size = fd->getSize().v;
+ }
+
+ // Defined here rather than in the header.
+ // NOTE(review): presumably so the header can hold `fd` as an incomplete
+ // type - confirm against contentProvider.h.
+ FileProvider::~FileProvider() = default;
+
+ // Return the next piece of the file, at most BUFFER_SIZE bytes, or an empty
+ // blob once the whole file has been fed.
+ // The returned blob refers to the provider's internal buffer, which is
+ // overwritten by the next call.
+ // Throws std::runtime_error when the read reports an error.
+ Blob FileProvider::feed()
+ {
+   const auto sizeToRead = std::min(BUFFER_SIZE, size-offset);
+   if (!sizeToRead) {
+     return Blob(nullptr, 0);
+   }
+
+   const auto readResult = fd->readAt(buffer.get(), zim::zsize_t(sizeToRead), zim::offset_t(offset));
+   // The error sentinel is "all bits set" in the result's own type. The
+   // literal -1UL is only 32 bits wide where unsigned long is 32-bit (e.g.
+   // Windows) and would never match a 64-bit result there.
+   if (readResult.v == static_cast<decltype(readResult.v)>(-1)) {
+     throw std::runtime_error("Error reading file " + filepath);
+   }
+   offset += sizeToRead;
+   return Blob(buffer.get(), sizeToRead);
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "counterHandler.h"
+#include "creatordata.h"
+
+#include <zim/writer/contentProvider.h>
+#include <zim/blob.h>
+
+using namespace zim::writer;
+
+// Keep a pointer to the creator data; it is used by createDirents() to
+// create the "Counter" dirent. The pointer is not owned.
+CounterHandler::CounterHandler(CreatorData* data)
+ : mp_creatorData(data)
+{}
+
+CounterHandler::~CounterHandler() = default;
+
+// Nothing to prepare before entries are handled.
+void CounterHandler::start() {
+}
+
+// Nothing to finalize: the counter content is built on demand by
+// getContentProviders().
+void CounterHandler::stop() {
+}
+
+// Create the single dirent of this handler: entry "Counter" in namespace M
+// with mime type "text/plain".
+DirentHandler::Dirents CounterHandler::createDirents() const {
+  auto counterDirent = mp_creatorData->createDirent(NS::M, "Counter", "text/plain", "");
+  Dirents ret;
+  ret.push_back(counterDirent);
+  return ret;
+}
+
+// Build the content of the "Counter" entry: the collected counters formatted
+// as "mimetype=count" items separated by ';', in the iteration order of the
+// counter map.
+DirentHandler::ContentProviders CounterHandler::getContentProviders() const {
+  std::stringstream ss;
+  // Empty before the first item, ";" before every following one.
+  const char* separator = "";
+  for (const auto& entry : m_mimetypeCounter) {
+    ss << separator << entry.first << "=" << entry.second;
+    separator = ";";
+  }
+
+  ContentProviders ret;
+  ret.push_back(std::unique_ptr<ContentProvider>(new StringProvider(ss.str())));
+  return ret;
+}
+
+// Dirents handled without an item (only hints) are not counted.
+void CounterHandler::handle(Dirent* dirent, const Hints& hints)
+{
+}
+
+// Count the item under its mime type. Only dirents of namespace C are
+// counted, and items reporting an empty mime type are skipped.
+void CounterHandler::handle(Dirent* dirent, std::shared_ptr<Item> item)
+{
+  const bool inContentNamespace = (dirent->getNamespace() == NS::C);
+  if (!inContentNamespace) {
+    return;
+  }
+  const auto mimetype = item->getMimeType();
+  if (!mimetype.empty()) {
+    ++m_mimetypeCounter[mimetype];
+  }
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_COUNTER_HANDLER_H
+#define OPENZIM_LIBZIM_COUNTER_HANDLER_H
+
+#include "handler.h"
+
+#include <map>
+
+namespace zim {
+namespace writer {
+
+
+// DirentHandler that counts the handled items per mime type and exposes the
+// result as the content of one dirent.
+class CounterHandler : public DirentHandler {
+ public:
+ // Number of items seen, keyed by mime type.
+ typedef std::map<std::string, entry_index_type> Counter;
+
+ explicit CounterHandler(CreatorData* data);
+ virtual ~CounterHandler();
+
+ void start() override;
+ void stop() override;
+ // The content produced by this handler may be compressed.
+ bool isCompressible() override { return true; }
+ ContentProviders getContentProviders() const override;
+ // Count `item` under its mime type.
+ void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
+ // Dirents without an item are ignored.
+ void handle(Dirent* dirent, const Hints& hints) override;
+
+ private:
+ Dirents createDirents() const override;
+ // Creator data used to create the dirent; not owned.
+ CreatorData* mp_creatorData;
+ Counter m_mimetypeCounter;
+};
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_COUNTER_HANDLER_H
--- /dev/null
+/*
+ * Copyright (C) 2019-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2021 Veloman Yunkan
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/writer/creator.h>
+
+#include "config.h"
+
+#include "creatordata.h"
+#include "cluster.h"
+#include "debug.h"
+#include "workers.h"
+#include "clusterWorker.h"
+#include <zim/blob.h>
+#include <zim/writer/contentProvider.h>
+#include "../endian_tools.h"
+#include <algorithm>
+#include <fstream>
+#include "../md5.h"
+#include "../constants.h"
+#include "counterHandler.h"
+
+#if defined(ENABLE_XAPIAN)
+# include "xapianHandler.h"
+#endif
+
+#ifdef _WIN32
+# include <io.h>
+# include <fcntl.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+#include <sys/stat.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <limits>
+#include <stdexcept>
+#include <sstream>
+#include <ctime>
+#include "log.h"
+#include "../fs.h"
+#include "../tools.h"
+
+log_define("zim.writer.creator")
+
+// Log `e` with log_info and also print it on standard output.
+#define INFO(e) \
+ do { \
+ log_info(e); \
+ std::cout << e << std::endl; \
+ } while(false)
+
+// When m_verbose is set, print `e` prefixed with the number of seconds
+// elapsed since data->start_time. Needs `m_verbose` and `data` in scope.
+// NOTE(review): expands to a bare `if` block (no do/while wrapper), so an
+// `else` written right after a use would bind to this `if`.
+#define TINFO(e) \
+ if (m_verbose) { \
+ double seconds = difftime(time(NULL), data->start_time); \
+ std::cout << "T:" << (int)(seconds) \
+ << "; " << e << std::endl; \
+ }
+
+// When m_verbose is set, print one progress line: elapsed seconds followed
+// by the dirent, item, cluster and pending-task counters read from `data`.
+// NOTE(review): same bare-`if` expansion as TINFO.
+#define TPROGRESS() \
+ if (m_verbose ) { \
+ double seconds = difftime(time(NULL),data->start_time); \
+ std::cout << "T:" << (int)seconds \
+ << "; A:" << data->dirents.size() \
+ << "; RA:" << data->nbRedirectItems \
+ << "; CA:" << data->nbCompItems \
+ << "; UA:" << data->nbUnCompItems \
+ << "; C:" << data->nbClusters \
+ << "; CC:" << data->nbCompClusters \
+ << "; UC:" << data->nbUnCompClusters \
+ << "; WC:" << data->taskList.size() \
+ << std::endl; \
+ }
+
+
+#define CLUSTER_BASE_OFFSET 1024
+
+namespace zim
+{
+ namespace writer
+ {
+ // Start with the default cluster size; the other settings are not
+ // initialised here.
+ Creator::Creator()
+ : m_clusterSize(DEFAULT_CLUSTER_SIZE)
+ {}
+ Creator::~Creator() = default;
+
+ // Enable or disable the progress output (see TINFO / TPROGRESS).
+ // Returns *this so that configuration calls can be chained.
+ Creator& Creator::configVerbose(bool verbose)
+ {
+ m_verbose = verbose;
+ return *this;
+ }
+
+ // Select the compression to use. Choosing LZMA is still accepted but
+ // prints a deprecation warning on standard error.
+ // Returns *this so that configuration calls can be chained.
+ Creator& Creator::configCompression(Compression compression)
+ {
+   const bool deprecated = (compression == Compression::Lzma);
+   if (deprecated) {
+     std::cerr << "WARNING: LZMA compression method is deprecated."
+               << " Support for it will be dropped from libzim soon."
+               << std::endl;
+   }
+   m_compression = compression;
+   return *this;
+ }
+
+ // Set the target cluster size later handed to CreatorData.
+ // Returns *this so that configuration calls can be chained.
+ Creator& Creator::configClusterSize(zim::size_type targetSize)
+ {
+ m_clusterSize = targetSize;
+ return *this;
+ }
+
+ // Record whether indexing is requested and for which language; both values
+ // are later handed to CreatorData.
+ // Returns *this so that configuration calls can be chained.
+ Creator& Creator::configIndexing(bool indexing, const std::string& language)
+ {
+ m_withIndex = indexing;
+ m_indexingLanguage = language;
+ return *this;
+ }
+
+ // Set the number of worker threads started by startZimCreation().
+ // Returns *this so that configuration calls can be chained.
+ Creator& Creator::configNbWorkers(unsigned nbWorkers)
+ {
+ m_nbWorkers = nbWorkers;
+ return *this;
+ }
+
+ // Create the CreatorData for `filepath` from the configured settings, then
+ // start m_nbWorkers threads running taskRunner and one thread running
+ // clusterWriter, all operating on that data.
+ void Creator::startZimCreation(const std::string& filepath)
+ {
+   data.reset(
+     new CreatorData(filepath, m_verbose, m_withIndex, m_indexingLanguage, m_compression, m_clusterSize)
+   );
+   CreatorData* const creatorData = this->data.get();
+
+   for (unsigned workerNo = 0; workerNo < m_nbWorkers; ++workerNo)
+   {
+     data->workerThreads.push_back(std::thread(taskRunner, creatorData));
+   }
+
+   data->writerThread = std::thread(clusterWriter, creatorData);
+ }
+
+ // Add `item` to the archive: create its dirent, register its content
+ // (compressed or not according to the item's COMPRESS hint) and pass it to
+ // the handlers. Prints a progress line whenever the number of dirents is a
+ // multiple of 1000.
+ void Creator::addItem(std::shared_ptr<Item> item)
+ {
+   const bool compressContent = item->getAmendedHints()[COMPRESS];
+   auto dirent = data->createItemDirent(item.get());
+   data->addItemData(dirent, item->getContentProvider(), compressContent);
+   data->handle(dirent, item);
+
+   const bool atProgressStep = (data->dirents.size() % 1000 == 0);
+   if (atProgressStep) {
+     TPROGRESS();
+   }
+ }
+
+ // Add a metadata entry whose content is the given string; forwards to the
+ // provider-based overload.
+ void Creator::addMetadata(const std::string& name, const std::string& content, const std::string& mimetype)
+ {
+   std::unique_ptr<ContentProvider> stringProvider(new StringProvider(content));
+   addMetadata(name, std::move(stringProvider), mimetype);
+ }
+
+ // Add a metadata entry named `name` in namespace M. The content is
+ // compressed when isCompressibleMimetype(mimetype) says so.
+ void Creator::addMetadata(const std::string& name, std::unique_ptr<ContentProvider> provider, const std::string& mimetype)
+ {
+   const auto shouldCompress = isCompressibleMimetype(mimetype);
+   auto metadataDirent = data->createDirent(NS::M, name, mimetype, "");
+   data->addItemData(metadataDirent, std::move(provider), shouldCompress);
+   data->handle(metadataDirent);
+ }
+
+ // Add an illustration whose content is the given string; forwards to the
+ // provider-based overload.
+ void Creator::addIllustration(unsigned int size, const std::string& content)
+ {
+   std::unique_ptr<ContentProvider> stringProvider(new StringProvider(content));
+   addIllustration(size, std::move(stringProvider));
+ }
+
+ // Add an illustration as the metadata entry
+ // "Illustration_<size>x<size>@1" with mime type "image/png".
+ void Creator::addIllustration(unsigned int size, std::unique_ptr<ContentProvider> provider)
+ {
+   std::ostringstream metadataName;
+   metadataName << "Illustration_" << size << "x" << size << "@1";
+   addMetadata(metadataName.str(), std::move(provider), "image/png");
+ }
+
+ // Add a redirection from `path` to `targetPath`, both in namespace C, and
+ // pass the new dirent to the handlers together with `hints`. Prints a
+ // progress line whenever the number of dirents is a multiple of 1000.
+ void Creator::addRedirection(const std::string& path, const std::string& title, const std::string& targetPath, const Hints& hints)
+ {
+   auto redirectDirent = data->createRedirectDirent(NS::C, path, title, NS::C, targetPath);
+
+   const bool atProgressStep = (data->dirents.size() % 1000 == 0);
+   if (atProgressStep) {
+     TPROGRESS();
+   }
+
+   data->handle(redirectDirent, hints);
+ }
+
+ // Finalize the archive.
+ //
+ // Steps, in order: create the main page redirect (if a main path is set),
+ // feed the handlers' own dirents to the title listing handler, fix up the
+ // dirents (redirects, indexes, mimetypes), stop the handlers and store their
+ // content, close the pending clusters, wait for the worker/writer threads,
+ // write the remaining parts of the file and finally rename the temporary
+ // file to its final name.
+ void Creator::finishZimCreation()
+ {
+ // Create a redirection for the mainPage.
+ // We need to keep the created dirent to set the fileheader.
+ // Dirent doesn't have to be deleted.
+ if (!m_mainPath.empty()) {
+ data->mainPageDirent = data->createRedirectDirent(NS::W, "mainPage", "", NS::C, m_mainPath);
+ data->handle(data->mainPageDirent);
+ }
+
+ TPROGRESS();
+
+ // mp_titleListingHandler is a special case: it has to handle all dirents (including its own).
+ for(auto& handler:data->m_direntHandlers) {
+ // getDirents() silently creates all the needed dirents on first call.
+ for(auto dirent:handler->getDirents()) {
+ data->mp_titleListingHandler->handle(dirent, Hints());
+ }
+ }
+
+ // Now that we have all the dirents (but not the data), we must correctly set/fix the dirents
+ // before we ask the handlers for their data.
+ TINFO("ResolveRedirectIndexes");
+ data->resolveRedirectIndexes();
+
+ TINFO("Set entry indexes");
+ data->setEntryIndexes();
+
+ TINFO("Resolve mimetype");
+ data->resolveMimeTypes();
+
+ // We can now stop the direntHandlers, and get their content
+ bool titleListDirentSeen = false;
+ for(auto& handler:data->m_direntHandlers) {
+ handler->stop();
+ const auto& dirents = handler->getDirents();
+ if (dirents.empty()) {
+ continue;
+ }
+ auto providers = handler->getContentProviders();
+ // A handler must return exactly one content provider per dirent.
+ ASSERT(dirents.size(), ==, providers.size());
+ auto provider_it = providers.begin();
+ for(auto& dirent:dirents) {
+ // As we use a "handler level" isCompressible, all content of the same handler
+ // must have the same compression.
+ data->addItemData(dirent, std::move(*provider_it), handler->isCompressible());
+ if (handler == data->mp_titleListingHandler && !titleListDirentSeen) {
+ // We have to get the offset of the titleList in the cluster before
+ // we close the cluster. Once the cluster is closed, the offset information is dropped.
+ // This works only if titleListingHandler creates the full (V0) title list in its first dirent.
+ // NOTE(review): reads from uncompCluster, so this presumably relies on the
+ // title listing handler being non-compressible - confirm.
+ data->m_titleListBlobOffset = data->uncompCluster->getBlobOffset(dirent->getBlobNumber());
+ titleListDirentSeen = true;
+ }
+ provider_it++;
+ }
+ }
+
+ // All the data has been added, we can now close all clusters
+ if (data->compCluster->count())
+ data->closeCluster(true);
+
+ if (data->uncompCluster->count())
+ data->closeCluster(false);
+
+ TINFO("Waiting for workers");
+ // Wait until no cluster task is pending anymore. The sleep argument grows
+ // by 10 on each iteration (unit presumably microseconds - see microsleep()).
+ unsigned int wait = 0;
+ do {
+ microsleep(wait);
+ wait += 10;
+ } while(ClusterTask::waiting_task.load() > 0);
+
+ data->quitAllThreads();
+
+ // Delete all handlers (they will clean their own data)
+ data->m_direntHandlers.clear();
+
+ TINFO(data->dirents.size() << " title index created");
+ TINFO(data->clustersList.size() << " clusters created");
+
+ TINFO("write zimfile :");
+ writeLastParts();
+ // NOTE(review): the results of close() and rename() are not checked here.
+ ::close(data->out_fd);
+
+ TINFO("rename tmpfile to final one.");
+ DEFAULTFS::rename(data->tmpFileName, data->zimName);
+
+ TINFO("finish");
+ }
+
+ // Fill the parts of `header` that are known before the last sections are
+ // written: main/layout page, uuid, entry and cluster counts, mime list
+ // position and title index position.
+ void Creator::fillHeader(Fileheader* header) const
+ {
+   // std::numeric_limits<entry_index_type>::max() is used for "no such page".
+   if (data->mainPageDirent) {
+     header->setMainPage(entry_index_type(data->mainPageDirent->getIdx()));
+   } else {
+     header->setMainPage(std::numeric_limits<entry_index_type>::max());
+   }
+   header->setLayoutPage(std::numeric_limits<entry_index_type>::max());
+
+   header->setUuid(m_uuid);
+   header->setArticleCount(data->dirents.size());
+
+   // The mime type list is placed right after the fixed-size header.
+   header->setMimeListPos(Fileheader::size);
+
+   // We assume here that titleListingHandler creates the V0 listing in its
+   // first dirent.
+   auto titleListCluster = data->mp_titleListingHandler->getDirents()[0]->getCluster();
+   const auto titleListPos = offset_type(
+       titleListCluster->getOffset() + titleListCluster->getDataOffset() + data->m_titleListBlobOffset);
+   header->setTitleIdxPos(titleListPos);
+
+   header->setClusterCount(data->clustersList.size());
+ }
+
+ // Write everything that is not cluster data: the mime type list, the
+ // directory entries, the url pointer list, the cluster pointer list, the
+ // header and finally the MD5 checksum of the whole file.
+ //
+ // Throws std::runtime_error if reading the file back for the checksum fails
+ // (and, where _write is the checked macro, if a write is short).
+ void Creator::writeLastParts() const
+ {
+ Fileheader header;
+ fillHeader(&header);
+
+ int out_fd = data->out_fd;
+
+ // The mime type list goes in the space before CLUSTER_BASE_OFFSET:
+ // each mime type is written with its terminating NUL byte.
+ lseek(out_fd, header.getMimeListPos(), SEEK_SET);
+ TINFO(" write mimetype list");
+ for(auto& mimeType: data->mimeTypesList)
+ {
+ _write(out_fd, mimeType.c_str(), mimeType.size()+1);
+ }
+
+ // A single NUL byte (an empty string) terminates the list.
+ _write(out_fd, "", 1);
+
+ // The list must not overlap the cluster area.
+ ASSERT(lseek(out_fd, 0, SEEK_CUR), <, CLUSTER_BASE_OFFSET);
+
+ TINFO(" write directory entries");
+ // Dirents are appended at the end of the file; each one records its own offset.
+ lseek(out_fd, 0, SEEK_END);
+ for (Dirent* dirent: data->dirents)
+ {
+ dirent->setOffset(offset_t(lseek(out_fd, 0, SEEK_CUR)));
+ dirent->write(out_fd);
+ }
+
+ TINFO(" write url prt list");
+ // One little-endian offset per dirent, in the iteration order of data->dirents.
+ header.setUrlPtrPos(lseek(out_fd, 0, SEEK_CUR));
+ for (auto& dirent: data->dirents)
+ {
+ char tmp_buff[sizeof(offset_type)];
+ toLittleEndian(dirent->getOffset(), tmp_buff);
+ _write(out_fd, tmp_buff, sizeof(offset_type));
+ }
+
+ TINFO(" write cluster offset list");
+ // One little-endian offset per cluster, in clustersList order.
+ header.setClusterPtrPos(lseek(out_fd, 0, SEEK_CUR));
+ for (auto cluster : data->clustersList)
+ {
+ char tmp_buff[sizeof(offset_type)];
+ toLittleEndian(cluster->getOffset(), tmp_buff);
+ _write(out_fd, tmp_buff, sizeof(offset_type));
+ }
+
+ // The checksum is stored right after the cluster pointer list.
+ header.setChecksumPos(lseek(out_fd, 0, SEEK_CUR));
+
+ TINFO(" write header");
+ lseek(out_fd, 0, SEEK_SET);
+ header.write(out_fd);
+
+ TINFO(" write checksum");
+ // Read the whole file back from the start, in chunks of 1024 bytes, to
+ // compute its MD5 digest.
+ struct zim_MD5_CTX md5ctx;
+ unsigned char batch_read[1024+1];
+ lseek(out_fd, 0, SEEK_SET);
+ zim_MD5Init(&md5ctx);
+ while (true) {
+ auto r = read(out_fd, batch_read, 1024);
+ if (r == -1) {
+ perror("Cannot read");
+ throw std::runtime_error("oups");
+ }
+ if (r == 0)
+ break;
+ // NOTE(review): the NUL terminator looks unnecessary since the length is
+ // passed explicitly to zim_MD5Update - confirm before removing.
+ batch_read[r] = 0;
+ zim_MD5Update(&md5ctx, batch_read, r);
+ }
+ // The read loop stopped at end of file, so the 16-byte digest is appended there.
+ unsigned char digest[16];
+ zim_MD5Final(digest, &md5ctx);
+ _write(out_fd, reinterpret_cast<const char*>(digest), 16);
+ }
+
+ // Create (or truncate) the temporary output file `fname + ".tmp"`, position
+ // it at CLUSTER_BASE_OFFSET, allocate the two "current" clusters and start
+ // every dirent handler.
+ //
+ // Throws std::runtime_error if the temporary file cannot be created or if
+ // seeking to CLUSTER_BASE_OFFSET fails.
+ CreatorData::CreatorData(const std::string& fname,
+                          bool verbose,
+                          bool withIndex,
+                          std::string language,
+                          Compression c,
+                          size_t clusterSize)
+   : mainPageDirent(nullptr),
+     compression(c),
+     zimName(fname),
+     tmpFileName(fname + ".tmp"),
+     clusterSize(clusterSize),
+     withIndex(withIndex),
+     indexingLanguage(language),
+     verbose(verbose),
+     nbItems(0),  // was previously left uninitialized
+     nbRedirectItems(0),
+     nbCompItems(0),
+     nbUnCompItems(0),
+     nbClusters(0),
+     nbCompClusters(0),
+     nbUnCompClusters(0),
+     start_time(time(NULL))
+ {
+#ifdef _WIN32
+   int flag = _O_RDWR | _O_CREAT | _O_TRUNC | _O_BINARY;
+   int mode = _S_IREAD | _S_IWRITE;
+#else
+   int flag = O_RDWR | O_CREAT | O_TRUNC;
+   mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+#endif
+   out_fd = open(tmpFileName.c_str(), flag, mode);
+   if (out_fd == -1){
+     perror(nullptr);
+     std::ostringstream ss;
+     ss << "Cannot create file " << tmpFileName;
+     throw std::runtime_error(ss.str());
+   }
+   if(lseek(out_fd, CLUSTER_BASE_OFFSET, SEEK_SET) != CLUSTER_BASE_OFFSET) {
+     // Report the lseek() failure first: close() may overwrite errno.
+     perror(nullptr);
+     close(out_fd);
+     throw std::runtime_error("Impossible to seek in file");
+   }
+
+   // We keep both a "compressed cluster" and an "uncompressed cluster"
+   // because we don't know which one will fill up first. We also need
+   // to track the dirents currently in each, so we can fix up the
+   // cluster index if the other one ends up written first.
+   compCluster = new Cluster(compression);
+   uncompCluster = new Cluster(Compression::None);
+
+#if defined(ENABLE_XAPIAN)
+   auto xapianIndexer = std::make_shared<XapianHandler>(this, withIndex);
+   m_direntHandlers.push_back(xapianIndexer);
+#endif
+
+   // The title listing handler is also kept in a dedicated member because
+   // other code needs to address it specifically.
+   mp_titleListingHandler = std::make_shared<TitleListingHandler>(this);
+   m_direntHandlers.push_back(mp_titleListingHandler);
+   m_direntHandlers.push_back(std::make_shared<CounterHandler>(this));
+
+   for(auto& handler:m_direntHandlers) {
+     handler->start();
+   }
+ }
+
+ // Stop the background threads, then release every cluster still owned by
+ // this object (the two "current" clusters and all closed clusters).
+ CreatorData::~CreatorData()
+ {
+   // The worker and writer threads hold raw pointers to the clusters, so they
+   // must be joined BEFORE any cluster is deleted. quitAllThreads() is a
+   // no-op when the threads have already been stopped.
+   quitAllThreads();
+
+   // `delete` on a null pointer is a no-op, no check needed.
+   delete compCluster;
+   delete uncompCluster;
+   for(auto& cluster: clustersList) {
+     delete cluster;
+   }
+ }
+
+ // Ask every worker thread and the writer thread to stop, and join them.
+ // Safe to call several times: once the threads are joined it does nothing.
+ void CreatorData::quitAllThreads() {
+   // Push one null task per worker thread, then join all the workers.
+   for (auto remaining = workerThreads.size(); remaining > 0; --remaining) {
+     taskList.pushToQueue(nullptr);
+   }
+   for (auto& worker : workerThreads) {
+     worker.join();
+   }
+   workerThreads.clear();
+
+   // Same for the writer thread, which receives a null cluster.
+   if (!writerThread.joinable()) {
+     return;
+   }
+   clusterToWrite.pushToQueue(nullptr);
+   writerThread.join();
+ }
+
+ // Register `dirent` in the set of dirents.
+ //
+ // If an entry with the same url already exists:
+ //  - an existing redirect is replaced by a new non-redirect dirent;
+ //  - otherwise the new dirent is rejected and a message is printed on stderr.
+ // Redirect dirents are also recorded as "unresolved" and counted.
+ void CreatorData::addDirent(Dirent* dirent)
+ {
+   auto insertion = dirents.insert(dirent);
+   const bool collided = !insertion.second;
+   if (collided) {
+     Dirent* previous = *insertion.first;
+     const bool itemReplacesRedirect = previous->isRedirect() && !dirent->isRedirect();
+     if (!itemReplacesRedirect) {
+       std::cerr << "Impossible to add " << NsAsChar(dirent->getNamespace()) << "/" << dirent->getPath() << std::endl;
+       std::cerr << " dirent's title to add is : " << dirent->getTitle() << std::endl;
+       std::cerr << " existing dirent's title is : " << previous->getTitle() << std::endl;
+       return;
+     }
+     unresolvedRedirectDirents.erase(previous);
+     dirents.erase(insertion.first);
+     dirents.insert(dirent);
+   }
+
+   // Only redirects need extra bookkeeping: they have no blob to add.
+   if (!dirent->isRedirect()) {
+     return;
+   }
+   unresolvedRedirectDirents.insert(dirent);
+   nbRedirectItems++;
+ }
+
+ // Attach the content given by `provider` to `dirent`, storing it in the
+ // current compressed or uncompressed cluster depending on `compressContent`.
+ // If adding the content would make a non-empty cluster reach `clusterSize`,
+ // that cluster is closed first and a new one receives the content.
+ void CreatorData::addItemData(Dirent* dirent, std::unique_ptr<ContentProvider> provider, bool compressContent)
+ {
+   const auto contentSize = provider->getSize();
+   if (contentSize > 0)
+   {
+     isEmpty = false;
+   }
+
+   auto target = compressContent ? compCluster : uncompCluster;
+
+   // If the cluster would become too large, write it to disk and open a new
+   // one for the content.
+   const bool wouldOverflow =
+       target->count() && (target->size().v+contentSize >= clusterSize);
+   if (wouldOverflow)
+   {
+     log_info("cluster with " << target->count() << " items, " <<
+              target->size() << " bytes; current title \"" <<
+              dirent->getTitle() << '\"');
+     target = closeCluster(compressContent);
+   }
+
+   dirent->setCluster(target);
+   target->addContent(std::move(provider));
+
+   // Update the statistics for the kind of item just added.
+   auto& itemCounter = compressContent ? nbCompItems : nbUnCompItems;
+   itemCounter++;
+ }
+
+ // Create a classic (non-redirect) dirent from the pool, register it with
+ // addDirent() and return it. The mimetype string is translated to its
+ // index (allocating a new index if needed).
+ Dirent* CreatorData::createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title)
+ {
+   const auto mimeTypeIdx = getMimeTypeIdx(mimetype);
+   Dirent* created = pool.getClassicDirent(ns, path, title, mimeTypeIdx);
+   addDirent(created);
+   return created;
+ }
+
+ // Create (and register) the dirent of a user item, in the C namespace.
+ // An item with an empty mimetype gets "application/octet-stream" and a
+ // warning is printed on stderr.
+ Dirent* CreatorData::createItemDirent(const Item* item)
+ {
+   // Query the path once and reuse it: getPath() is user-provided code and
+   // the local was previously computed but never used.
+   auto path = item->getPath();
+   auto mimetype = item->getMimeType();
+   if (mimetype.empty()) {
+     std::cerr << "Warning, " << path << " have empty mimetype." << std::endl;
+     mimetype = "application/octet-stream";
+   }
+   return createDirent(NS::C, path, mimetype, item->getTitle());
+ }
+
+ // Create a redirect dirent from the pool, register it with addDirent() and
+ // return it. The target is given by namespace and path.
+ Dirent* CreatorData::createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath)
+ {
+   Dirent* const redirect = pool.getRedirectDirent(ns, path, title, targetNs, targetPath);
+   addDirent(redirect);
+   return redirect;
+ }
+
+ // Close the current compressed (or uncompressed) cluster: give it its
+ // index, queue it for the worker and writer threads, and replace it with a
+ // fresh empty cluster, which is returned.
+ Cluster* CreatorData::closeCluster(bool compressed)
+ {
+   // Select the "current cluster" slot and the matching statistic counter.
+   Cluster*& currentSlot = compressed ? compCluster : uncompCluster;
+   auto& kindCounter = compressed ? nbCompClusters : nbUnCompClusters;
+
+   nbClusters++;
+   kindCounter++;
+
+   Cluster* finished = currentSlot;
+   finished->setClusterIndex(cluster_index_t(clustersList.size()));
+   clustersList.push_back(finished);
+   taskList.pushToQueue(new ClusterTask(finished));
+   clusterToWrite.pushToQueue(finished);
+
+   // Open the replacement cluster with the same compression policy.
+   currentSlot = new Cluster(compressed ? compression : Compression::None);
+   return currentSlot;
+ }
+
+ // Number the dirents from 0, following the iteration order of `dirents`.
+ void CreatorData::setEntryIndexes()
+ {
+   INFO("set index");
+   entry_index_t nextIndex(0);
+   for (auto it = dirents.begin(); it != dirents.end(); ++it) {
+     (*it)->setIdx(nextIndex);
+     nextIndex += 1;
+   }
+ }
+
+ // Bind every pending redirect dirent to its target dirent.
+ //
+ // A redirect whose target does not exist is removed from `dirents` and
+ // marked as removed; if it was the main page dirent, the main page is unset.
+ void CreatorData::resolveRedirectIndexes()
+ {
+ // Translate each redirect's (namespace, path) target into a dirent.
+ INFO("Resolve redirect");
+ for (auto dirent: unresolvedRedirectDirents)
+ {
+ // Temporary dirent used only as a search key in `dirents`.
+ Dirent tmpDirent(dirent->getRedirectNs(), dirent->getRedirectPath());
+ auto target_pos = dirents.find(&tmpDirent);
+ if(target_pos == dirents.end()) {
+ INFO("Invalid redirection "
+ << NsAsChar(dirent->getNamespace()) << '/' << dirent->getPath()
+ << " redirecting to (missing) "
+ << NsAsChar(dirent->getRedirectNs()) << '/' << dirent->getRedirectPath());
+ // Erasing from `dirents` is safe here: the loop iterates over
+ // `unresolvedRedirectDirents`, which is a different container.
+ // NOTE(review): a redirect already bound to this dirent earlier in the
+ // loop would keep pointing to it - confirm whether chains can occur.
+ dirents.erase(dirent);
+ dirent->markRemoved();
+ if (dirent == mainPageDirent) {
+ mainPageDirent = nullptr;
+ }
+ } else {
+ dirent->setRedirect(*target_pos);
+ }
+ }
+ }
+
+ // Build the final, alphabetically sorted mime type list and renumber the
+ // mimetype index of every item dirent accordingly.
+ void CreatorData::resolveMimeTypes()
+ {
+   // Mime types in the iteration order of rmimeTypesMap (ordered by old index).
+   std::vector<std::string> registrationOrder;
+   for (auto& idxAndName: rmimeTypesMap)
+   {
+     registrationOrder.push_back(idxAndName.second);
+     mimeTypesList.push_back(idxAndName.second);
+   }
+
+   std::sort(mimeTypesList.begin(), mimeTypesList.end());
+
+   // oldToNew[position in registrationOrder] == position in the sorted list.
+   std::vector<uint16_t> oldToNew(registrationOrder.size());
+   for (unsigned oldIdx=0; oldIdx<registrationOrder.size(); ++oldIdx)
+   {
+     const auto& name = registrationOrder[oldIdx];
+     for (unsigned newIdx=0; newIdx<mimeTypesList.size(); ++newIdx)
+     {
+       if (name == mimeTypesList[newIdx]) {
+         oldToNew[oldIdx] = static_cast<uint16_t>(newIdx);
+       }
+     }
+   }
+
+   // Only item dirents carry a real mimetype index.
+   for (auto it = dirents.begin(); it != dirents.end(); ++it)
+   {
+     Dirent* current = *it;
+     if (!current->isItem()) {
+       continue;
+     }
+     current->setMimeType(oldToNew[current->getMimeType()]);
+   }
+ }
+
+ // Return the index associated with `mimeType`, allocating the next free
+ // index the first time a mime type is seen.
+ // Throws std::runtime_error when no more index can be allocated.
+ uint16_t CreatorData::getMimeTypeIdx(const std::string& mimeType)
+ {
+   auto known = mimeTypesMap.find(mimeType);
+   if (known != mimeTypesMap.end()) {
+     return known->second;
+   }
+
+   if (nextMimeIdx >= std::numeric_limits<uint16_t>::max()) {
+     throw std::runtime_error("too many distinct mime types");
+   }
+
+   // Record the mapping in both directions, then advance the counter.
+   const uint16_t assigned = nextMimeIdx;
+   mimeTypesMap[mimeType] = assigned;
+   rmimeTypesMap[assigned] = mimeType;
+   nextMimeIdx++;
+   return assigned;
+ }
+
+ // Return the mime type registered for `mimeTypeIdx`.
+ // Throws std::runtime_error if the index is unknown.
+ const std::string& CreatorData::getMimeType(uint16_t mimeTypeIdx) const
+ {
+   auto found = rmimeTypesMap.find(mimeTypeIdx);
+   if (found != rmimeTypesMap.end()) {
+     return found->second;
+   }
+   throw std::runtime_error("mime type index not found");
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2021 Manessh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CREATOR_DATA_H
+#define ZIM_WRITER_CREATOR_DATA_H
+
+#include <zim/writer/item.h>
+#include "queue.h"
+#include "_dirent.h"
+#include "workers.h"
+#include "handler.h"
+#include <set>
+#include <vector>
+#include <map>
+#include <fstream>
+#include <thread>
+#include "config.h"
+
+#include "../fileheader.h"
+#include "direntPool.h"
+#include "titleListingHandler.h"
+
+namespace zim
+{
+ namespace writer
+ {
+ // Ordering functor for containers of Dirent pointers: delegates to
+ // compareUrl() (presumably a "less than" on namespace/path - see its
+ // definition to confirm).
+ struct UrlCompare {
+ bool operator() (const Dirent* d1, const Dirent* d2) const {
+ return compareUrl(d1, d2);
+ }
+ };
+
+ class Cluster;
+ // Internal state shared by the writer while an archive is being created:
+ // the dirents, the mime type tables, the clusters, the worker/writer
+ // threads and their queues, the output file and the dirent handlers.
+ class CreatorData
+ {
+ public:
+ // Dirents ordered with UrlCompare.
+ typedef std::set<Dirent*, UrlCompare> UrlSortedDirents;
+ // mime type string -> index
+ typedef std::map<std::string, uint16_t> MimeTypesMap;
+ // index -> mime type string
+ typedef std::map<uint16_t, std::string> RMimeTypesMap;
+ typedef std::vector<std::string> MimeTypesList;
+ typedef std::vector<Cluster*> ClusterList;
+ typedef Queue<Cluster*> ClusterQueue;
+ typedef Queue<Task*> TaskQueue;
+ typedef std::vector<std::thread> ThreadList;
+
+ // Opens the temporary output file and starts the dirent handlers.
+ CreatorData(const std::string& fname, bool verbose,
+ bool withIndex, std::string language,
+ Compression compression,
+ size_t clusterSize);
+ virtual ~CreatorData();
+
+ // Register an already created dirent.
+ void addDirent(Dirent* dirent);
+ // Store the content of `dirent` in the current (un)compressed cluster.
+ void addItemData(Dirent* dirent, std::unique_ptr<ContentProvider> provider, bool compressContent);
+
+ // Factories: each creates a dirent from `pool` and registers it.
+ Dirent* createDirent(NS ns, const std::string& path, const std::string& mimetype, const std::string& title);
+ Dirent* createItemDirent(const Item* item);
+ Dirent* createRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath);
+ // Queue the current (un)compressed cluster and return its replacement.
+ Cluster* closeCluster(bool compressed);
+
+ // Fix-up passes run once all dirents are known.
+ void setEntryIndexes();
+ void resolveRedirectIndexes();
+ void resolveMimeTypes();
+
+ // Translate between a mime type string and its index.
+ uint16_t getMimeTypeIdx(const std::string& mimeType);
+ const std::string& getMimeType(uint16_t mimeTypeIdx) const;
+
+ // Stop and join the worker threads and the writer thread.
+ void quitAllThreads();
+
+ // Storage for all the dirents created by the factories above.
+ DirentPool pool;
+
+ UrlSortedDirents dirents;
+ // Redirect dirents whose target still has to be resolved.
+ UrlSortedDirents unresolvedRedirectDirents;
+ // Redirect to the main page, or nullptr if there is none.
+ Dirent* mainPageDirent;
+
+ MimeTypesMap mimeTypesMap;
+ RMimeTypesMap rmimeTypesMap;
+ // Final sorted list, filled by resolveMimeTypes().
+ MimeTypesList mimeTypesList;
+ // Next index to hand out in getMimeTypeIdx().
+ uint16_t nextMimeIdx = 0;
+
+ // Every closed cluster, in closing order (owned by this object).
+ ClusterList clustersList;
+ // Closed clusters waiting for the writer thread.
+ ClusterQueue clusterToWrite;
+ // Tasks waiting for the worker threads.
+ TaskQueue taskList;
+ ThreadList workerThreads;
+ std::thread writerThread;
+ // Compression used for the compressed clusters.
+ const Compression compression;
+ // Final file name, and name of the temporary file written meanwhile.
+ std::string zimName;
+ std::string tmpFileName;
+ // Becomes false as soon as some non-empty content is added.
+ bool isEmpty = true;
+ // Size threshold at which a cluster is closed.
+ size_t clusterSize;
+ // The clusters currently being filled.
+ Cluster *compCluster = nullptr;
+ Cluster *uncompCluster = nullptr;
+ // File descriptor of the temporary output file.
+ int out_fd;
+
+ bool withIndex;
+ std::string indexingLanguage;
+
+ std::shared_ptr<TitleListingHandler> mp_titleListingHandler;
+ offset_t m_titleListBlobOffset; // The offset of the title list blob,
+ // relative to the start of the cluster's data.
+ std::vector<std::shared_ptr<DirentHandler>> m_direntHandlers;
+ // Pass a dirent (and its hints) to every handler.
+ void handle(Dirent* dirent, const Hints& hints = Hints()) {
+ for(auto& handler: m_direntHandlers) {
+ handler->handle(dirent, hints);
+ }
+ }
+ // Pass a dirent and the item it was created from to every handler.
+ void handle(Dirent* dirent, std::shared_ptr<Item> item) {
+ for(auto& handler: m_direntHandlers) {
+ handler->handle(dirent, item);
+ }
+ }
+
+ // Some stats
+ bool verbose;
+ entry_index_type nbItems;
+ entry_index_type nbRedirectItems;
+ entry_index_type nbCompItems;
+ entry_index_type nbUnCompItems;
+ cluster_index_type nbClusters;
+ cluster_index_type nbCompClusters;
+ cluster_index_type nbUnCompClusters;
+ time_t start_time;
+
+ // Number of closed clusters.
+ cluster_index_t clusterCount() const
+ { return cluster_index_t(clustersList.size()); }
+
+ // Number of registered dirents (redirects included).
+ entry_index_t itemCount() const
+ { return entry_index_t(dirents.size()); }
+ };
+
+ }
+
+}
+
+#endif // ZIM_WRITER_CREATOR_DATA_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_DEFAULTINDEXDATA_H
+#define ZIM_WRITER_DEFAULTINDEXDATA_H
+
+#include <zim/writer/item.h>
+#include "xapian/myhtmlparse.h"
+#include "../tools.h"
+
+#include <atomic>
+#include <mutex>
+#include <sstream>
+
+namespace zim
+{
+ namespace writer
+ {
+ // IndexData implementation that extracts the indexing information from
+ // the (HTML) content of an item.
+ //
+ // The content is read and parsed lazily, on the first call to an accessor
+ // that needs it. When ENABLE_XAPIAN is not defined, nothing is parsed and
+ // the accessors return the empty defaults set by the constructor.
+ class DefaultIndexData : public IndexData {
+ public:
+ // Takes ownership of `contentProvider`. With ENABLE_XAPIAN, the title is
+ // stored with its accents removed; otherwise it is left empty.
+ DefaultIndexData(std::unique_ptr<ContentProvider> contentProvider, const std::string& title)
+ : m_initialized(false),
+ mp_contentProvider(std::move(contentProvider)),
+#if defined(ENABLE_XAPIAN)
+ m_title(zim::removeAccents(title)),
+#else
+ m_title(""),
+#endif
+ m_hasIndexData(false),
+ m_content(""),
+ m_keywords(""),
+ m_wordCount(0),
+ m_geoPosition(std::make_tuple(false, 0, 0))
+ {}
+
+ // Read and parse the content once. Subsequent calls return immediately.
+ void initialize() const {
+ if (m_initialized) {
+ return;
+ }
+ std::lock_guard<std::mutex> lock(m_initLock);
+ // We have to check again under the lock, so that two concurrent calls on
+ // an uninitialized object do not initialize it twice.
+ if (m_initialized) {
+ return;
+ }
+#if defined(ENABLE_XAPIAN)
+ // Concatenate all the blobs fed by the provider; an empty blob ends the content.
+ std::ostringstream ss;
+ while (true) {
+ auto blob = mp_contentProvider->feed();
+ if(blob.size() == 0) {
+ break;
+ }
+ ss << blob;
+ }
+ MyHtmlParser htmlParser;
+ // NOTE(review): any parsing exception is swallowed; the fields below then
+ // use whatever the parser holds at that point.
+ try {
+ htmlParser.parse_html(ss.str(), "UTF-8", true);
+ } catch(...) {}
+ // Index only non-empty text, when indexing is allowed and the text does
+ // not contain the "NOINDEX" marker.
+ m_hasIndexData = !htmlParser.dump.empty() && htmlParser.indexing_allowed && (htmlParser.dump.find("NOINDEX") == std::string::npos);
+ m_content = zim::removeAccents(htmlParser.dump);
+ m_keywords = zim::removeAccents(htmlParser.keywords);
+ m_wordCount = zim::countWords(htmlParser.dump);
+ if(htmlParser.has_geoPosition) {
+ m_geoPosition = std::make_tuple(true, htmlParser.latitude, htmlParser.longitude);
+ }
+#endif
+ // Set last, once every field above has its final value.
+ m_initialized = true;
+ }
+
+ bool hasIndexData() const {
+ initialize();
+ return m_hasIndexData;
+ }
+
+ // The title is set by the constructor; no parsing is needed.
+ std::string getTitle() const {
+ return m_title;
+ }
+
+ std::string getContent() const {
+ initialize();
+ return m_content;
+ }
+
+ std::string getKeywords() const {
+ initialize();
+ return m_keywords;
+ }
+
+ uint32_t getWordCount() const {
+ initialize();
+ return m_wordCount;
+ }
+
+ GeoPosition getGeoPosition() const
+ {
+ initialize();
+ return m_geoPosition;
+ }
+
+ private:
+ // The members below are `mutable` because initialize() is const.
+ mutable std::atomic<bool> m_initialized;
+ mutable std::mutex m_initLock;
+ std::unique_ptr<ContentProvider> mp_contentProvider;
+ std::string m_title;
+ mutable bool m_hasIndexData;
+ mutable std::string m_content;
+ mutable std::string m_keywords;
+ mutable uint32_t m_wordCount;
+ // (valid flag, latitude, longitude) as built in initialize().
+ mutable GeoPosition m_geoPosition;
+ };
+ }
+}
+
+#endif // ZIM_WRITER_DEFAULTINDEXDATA_H
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "_dirent.h"
+#include <zim/zim.h>
+#include "buffer.h"
+#include "endian_tools.h"
+#include "log.h"
+#include <algorithm>
+#include <cstring>
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+log_define("zim.dirent")
+
+namespace zim {
+namespace writer {
+
+// Return the character used to represent namespace `ns`.
+// Throws std::runtime_error for a value that is not a known namespace.
+char NsAsChar(NS ns) {
+  if (ns == NS::C) { return 'C'; }
+  if (ns == NS::M) { return 'M'; }
+  if (ns == NS::W) { return 'W'; }
+  if (ns == NS::X) { return 'X'; }
+  throw std::runtime_error("Invalid namespace value.");
+}
+
+// Constructor for a "classic" (non-redirect) dirent.
+// `mimetype` is a mime type index, not a mime type string.
+// The entry index and file offset start at 0, and the dirent is neither
+// removed nor a front article until set otherwise.
+Dirent::Dirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype)
+ : pathTitle(path, title),
+ mimeType(mimetype),
+ idx(0),
+ info(DirentInfo::Direct()),
+ offset(0),
+ _ns(static_cast<uint8_t>(ns)),
+ removed(false),
+ frontArticle(false)
+{}
+
+// Constructor for a "redirection" dirent.
+// The mime type is set to the special `redirectMimeType` value and the
+// target is recorded by namespace and path.
+Dirent::Dirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath)
+ : pathTitle(path, title),
+   mimeType(redirectMimeType),
+   idx(0),
+   // The Redirect temporary is already an rvalue: no std::move needed.
+   info(DirentInfo::Redirect(targetNs, targetPath)),
+   offset(0),
+   _ns(static_cast<uint8_t>(ns)),
+   removed(false),
+   frontArticle(false)
+{}
+
+// Namespace of the redirect target.
+NS Dirent::getRedirectNs() const {
+  const NS targetNs = info.getRedirect().ns;
+  return targetNs;
+}
+
+// Path of the redirect target.
+std::string Dirent::getRedirectPath() const {
+  std::string targetPath = info.getRedirect().targetPath;
+  return targetPath;
+}
+
+// Serialize this dirent to `out_fd` at the current file position.
+//
+// Layout of the fixed part (little endian):
+//   bytes 0-1   mime type
+//   byte  2     parameter size (always 0)
+//   byte  3     namespace character
+//   bytes 4-7   version
+//   bytes 8-11  redirect index (redirect) or cluster number (item)
+//   bytes 12-15 blob number (item only)
+// followed by the path/title data and one terminating NUL byte.
+void Dirent::write(int out_fd) const
+{
+  const static char zero = 0;
+  union
+  {
+    char d[16];
+    long a;
+  } header;
+  zim::toLittleEndian(getMimeType(), header.d);
+  header.d[2] = 0; // parameter size
+  header.d[3] = NsAsChar(getNamespace());
+
+  // Log through the member accessor: there is no `dirent` variable in this
+  // scope, so the previous expression would not compile with logging enabled.
+  log_debug("title=" << getTitle() << " title.size()=" << getTitle().size());
+
+  zim::toLittleEndian(getVersion(), header.d + 4);
+
+  if (isRedirect())
+  {
+    // A redirect has no cluster/blob: only 12 bytes of fixed header.
+    zim::toLittleEndian(getRedirectIndex().v, header.d + 8);
+    _write(out_fd, header.d, 12);
+  }
+  else
+  {
+    zim::toLittleEndian(zim::cluster_index_type(getClusterNumber()), header.d + 8);
+    zim::toLittleEndian(zim::blob_index_type(getBlobNumber()), header.d + 12);
+    _write(out_fd, header.d, 16);
+  }
+
+  _write(out_fd, pathTitle.data(), pathTitle.size());
+  _write(out_fd, &zero, 1);
+}
+
+}
+}
--- /dev/null
+/*
+ * Copyright (C) 2019-2021 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_DIRENTPOOL_H
+#define ZIM_WRITER_DIRENTPOOL_H
+
+#include "debug.h"
+#include "_dirent.h"
+
+namespace zim
+{
+ namespace writer {
+ // Block allocator for Dirent objects.
+ //
+ // Dirents are constructed with placement new inside raw blocks of 0xFFFF
+ // slots. They are never released individually: all of them are destroyed
+ // when the pool itself is destroyed.
+ class DirentPool {
+   private:
+     // Raw blocks. All blocks but the last one are full.
+     std::vector<Dirent*> pools;
+     // Number of dirents constructed in the last block
+     // (0xFFFF means "full", which is also the initial state with no block).
+     uint16_t direntIndex;
+
+     // Allocate a new raw block and make it the current one.
+     void allocate_new_pool() {
+       char* rawBlock = new char[sizeof(Dirent)*0xFFFF];
+       try {
+         pools.push_back(reinterpret_cast<Dirent*>(rawBlock));
+       } catch (...) {
+         // Do not leak the block if the vector cannot grow.
+         delete [] rawBlock;
+         throw;
+       }
+       direntIndex = 0;
+     }
+
+     // Destroy the first `count` dirents of a block, then free the block.
+     static void destroyPoolBlock(Dirent* pool, uint16_t count=0xFFFF) {
+       for (auto i = 0U; i < count; i++) {
+         try {
+           pool[i].~Dirent();
+         } catch (...){ /*discard*/ }
+       }
+       delete [] (reinterpret_cast<char*>(pool));
+     }
+
+     // Return the address of the next unused slot, allocating a new block
+     // when the current one is full. The slot is NOT marked as used.
+     Dirent* nextFreeSlot() {
+       if (direntIndex == 0xFFFF) {
+         allocate_new_pool();
+       }
+       return pools.back() + direntIndex;
+     }
+
+   public:
+     DirentPool() :
+       direntIndex(0xFFFF)
+     {}
+     DirentPool(const DirentPool&) = delete;
+     DirentPool& operator=(const DirentPool&) = delete;
+     ~DirentPool() {
+       auto nbPools = pools.size();
+       if (nbPools == 0) {
+         return;
+       }
+       // Delete all but the last pool (and call the destructors of the dirents)
+       for (auto i = 0U; i<nbPools-1; i++) {
+         destroyPoolBlock(pools[i]);
+       }
+       // In the last pool, only `direntIndex` dirents are really constructed.
+       destroyPoolBlock(pools[nbPools-1], direntIndex);
+     }
+
+     // Construct a classic dirent in the pool and return it.
+     Dirent* getClassicDirent(NS ns, const std::string& path, const std::string& title, uint16_t mimetype) {
+       auto dirent = nextFreeSlot();
+       new (dirent) Dirent(ns, path, title, mimetype);
+       // Count the slot only once the constructor has succeeded, so that the
+       // destructor never runs ~Dirent() on an unconstructed slot.
+       direntIndex++;
+       return dirent;
+     }
+
+     // Construct a redirect dirent in the pool and return it.
+     Dirent* getRedirectDirent(NS ns, const std::string& path, const std::string& title, NS targetNs, const std::string& targetPath) {
+       auto dirent = nextFreeSlot();
+       new (dirent) Dirent(ns, path, title, targetNs, targetPath);
+       // See getClassicDirent(): increment only after a successful construction.
+       direntIndex++;
+       return dirent;
+     }
+ };
+ }
+}
+
+#endif // ZIM_WRITER_DIRENTPOOL_H
+
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_WRITER_HANDLER_H
+#define OPENZIM_LIBZIM_WRITER_HANDLER_H
+
+#include <string>
+#include <memory>
+#include <vector>
+
+#include <zim/writer/item.h>
+
+namespace zim {
+namespace writer {
+
+class CreatorData;
+class ContentProvider;
+class Dirent;
+
+/**
+ * DirentHandler is used to add "extra" handling on dirents/items.
+ *
+ * The main purpose of a handler is to "see" all dirents corresponding to user entries
+ * and generate its own dirents/items.
+ *
+ * Classical use cases are:
+ * - Generating an index of the items (xapianIndex)
+ * - Generating a listing of the items (all items or "main" entries only)
+ * - Counting mimetypes
+ * - ...
+ *
+ * The workflow is the following:
+ * - Start the handler with `start()`.
+ * - Pass dirents to handle using `handle()`.
+ * If a handler has to handle its own dirents, it has to do it itself (in start/stop, ...).
+ * The handlers will NOT have dirents of other handlers passed.
+ * (Exception made for titleListingHandler)
+ * - Get the dirents associated to the handler using `getDirents()`
+ * (which calls `createDirents()` on first use).
+ * The handler must create dirents if the entry/entries associated to it must be created.
+ * It may create several dirents if several entries must be created.
+ * It may return an empty vector (no dirent) if no entry must be created (empty listing,...).
+ * - All dirents are correctly set (redirect resolved, index and mimetype set, ...)
+ * - Stop the handler with `stop()`.
+ * - Get the content of the handler using `getContentProviders()`.
+ * The handler MUST return the same number of content providers as the number of dirents it has returned.
+ *
+ * While it may seem that a DirentHandler is dynamically (de)activated by the user, it is not.
+ * This is purely an internal structure to simplify the internal architecture of the writer.
+ */
+// Abstract base class of the writer's dirent handlers.
+class DirentHandler {
+ public:
+ explicit DirentHandler(CreatorData* data);
+ virtual ~DirentHandler() = default;
+ using ContentProviders = std::vector<std::unique_ptr<ContentProvider>>;
+ using Dirents = std::vector<Dirent*>;
+
+ // Called once before any dirent is passed to handle().
+ virtual void start() = 0;
+ // Called once all dirents have been handled, before the content is requested.
+ virtual void stop() = 0;
+ // Whether the content of this handler must be stored compressed
+ // (the same answer applies to all the content of the handler).
+ virtual bool isCompressible() = 0;
+ // Return the dirents of this handler. They are created with
+ // createDirents() on the first call and cached for the following ones.
+ const Dirents& getDirents() {
+ if (!m_direntsCreated) {
+ m_dirents = createDirents();
+ m_direntsCreated = true;
+ }
+ return m_dirents;
+ }
+ // Return one content provider per dirent returned by getDirents().
+ virtual ContentProviders getContentProviders() const = 0;
+
+ /*
+ * Handle a dirent/item.
+ *
+ * item may be nullptr (dirent is a redirect or in special case)
+ */
+ virtual void handle(Dirent* dirent, std::shared_ptr<Item> item) = 0;
+ virtual void handle(Dirent* dirent, const Hints& hints) = 0;
+
+ protected:
+ // Create the dirents of the entries generated by this handler
+ // (the result may be empty).
+ virtual Dirents createDirents() const = 0;
+ DirentHandler() = default;
+
+ private:
+ // Cache for getDirents().
+ Dirents m_dirents;
+ bool m_direntsCreated {false};
+};
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_WRITER_HANDLER_H
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/writer/item.h>
+#include <zim/writer/contentProvider.h>
+#include "defaultIndexData.h"
+
+namespace zim
+{
+ namespace writer
+ {
+ // Default index data: only items whose mimetype starts with "text/html"
+ // are indexed, using their own content and title. Others return nullptr.
+ std::shared_ptr<IndexData> Item::getIndexData() const
+ {
+   const bool isHtml = (getMimeType().find("text/html") == 0);
+   if (!isHtml) {
+     return nullptr;
+   }
+
+   std::unique_ptr<ContentProvider> htmlContent = getContentProvider();
+   const auto itemTitle = getTitle();
+   return std::make_shared<DefaultIndexData>(std::move(htmlContent), itemTitle);
+ }
+
+ // Default implementation: no hint is given.
+ Hints Item::getHints() const {
+   Hints noHints;
+   return noHints;
+ }
+
+ // Return the hints of the item, completed with default values (derived
+ // from the mimetype) for the hints the item did not provide.
+ Hints Item::getAmendedHints() const {
+   auto amended = getHints();
+
+   // If no FRONT_ARTICLE hint is given, html content is a front article.
+   if (amended.count(FRONT_ARTICLE) == 0) {
+     const bool isHtml = (getMimeType().find("text/html") == 0);
+     amended[FRONT_ARTICLE] = isHtml;
+   }
+
+   // If no COMPRESS hint is given, determine it from the mimetype.
+   if (amended.count(COMPRESS) == 0) {
+     amended[COMPRESS] = isCompressibleMimetype(getMimeType());
+   }
+   return amended;
+ }
+
+ // Provide the content of the item.
+ //
+ // The provider is given a shared pointer that points to `content` while
+ // sharing ownership with this item (shared_ptr aliasing constructor).
+ std::unique_ptr<ContentProvider> StringItem::getContentProvider() const
+ {
+   std::shared_ptr<const std::string> contentAlias(shared_from_this(), &content);
+   std::unique_ptr<ContentProvider> provider(new SharedStringProvider(contentAlias));
+   return provider;
+ }
+
+ // Provide the content of the item through a FileProvider built on
+ // `filepath`.
+ std::unique_ptr<ContentProvider> FileItem::getContentProvider() const
+ {
+   std::unique_ptr<ContentProvider> provider(new FileProvider(filepath));
+   return provider;
+ }
+
+
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2016-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_QUEUE_H
+#define OPENZIM_LIBZIM_QUEUE_H
+
+#define MAX_QUEUE_SIZE 10
+
+#include <mutex>
+#include <queue>
+#include "../tools.h"
+
+// Queue protected by a mutex.
+//
+// Every member function locks m_queueMutex before accessing the underlying
+// std::queue. pushToQueue() additionally waits while the queue holds more
+// than MAX_QUEUE_SIZE elements.
+template<typename T>
+class Queue {
+ public:
+ Queue() = default;
+ virtual ~Queue() = default;
+ // True when the queue holds no element.
+ virtual bool isEmpty();
+ // Number of elements currently in the queue.
+ virtual size_t size();
+ // Append a copy of `element`, waiting first while the queue holds more
+ // than MAX_QUEUE_SIZE elements.
+ virtual void pushToQueue(const T& element);
+ // Copy the oldest element into `element` without removing it.
+ // Returns false (and leaves `element` untouched) when the queue is empty.
+ virtual bool getHead(T &element);
+ // Same as getHead(), but the element is also removed from the queue.
+ virtual bool popFromQueue(T &element);
+
+ protected:
+ std::queue<T> m_realQueue;
+ std::mutex m_queueMutex;
+
+ private:
+ // Make this queue non copyable
+ Queue(const Queue&);
+ Queue& operator=(const Queue&);
+};
+
+// Tell, under the queue lock, whether the queue holds no element.
+template<typename T>
+bool Queue<T>::isEmpty() {
+  std::lock_guard<std::mutex> guard(m_queueMutex);
+  const bool noElement = m_realQueue.empty();
+  return noElement;
+}
+
+// Return, under the queue lock, the number of queued elements.
+template<typename T>
+size_t Queue<T>::size() {
+  std::lock_guard<std::mutex> guard(m_queueMutex);
+  const size_t nbElements = m_realQueue.size();
+  return nbElements;
+}
+
+// Append a copy of `element` to the queue.
+//
+// The call first loops, sleeping a little longer at each iteration (the
+// first sleep is of 0), until the queue holds at most MAX_QUEUE_SIZE
+// elements. The size check and the push take the lock separately.
+template<typename T>
+void Queue<T>::pushToQueue(const T &element) {
+  unsigned int backoff = 0;
+
+  for (;;) {
+    zim::microsleep(backoff);
+    const unsigned int pending = size();
+    backoff += 10;
+    if (!(pending > MAX_QUEUE_SIZE)) {
+      break;
+    }
+  }
+
+  std::lock_guard<std::mutex> guard(m_queueMutex);
+  m_realQueue.push(element);
+}
+
+// Copy the oldest element of the queue into `element`, leaving it queued.
+//
+// Returns true if an element was copied, false (with `element` untouched)
+// if the queue is empty.
+template<typename T>
+bool Queue<T>::getHead(T &element) {
+  std::lock_guard<std::mutex> guard(m_queueMutex);
+  const bool hasHead = !m_realQueue.empty();
+  if (hasHead) {
+    element = m_realQueue.front();
+  }
+  return hasHead;
+}
+
+// Move the oldest element out of the queue: it is copied into `element`
+// and then removed.
+//
+// Returns true if an element was popped, false (with `element` untouched)
+// if the queue is empty.
+template<typename T>
+bool Queue<T>::popFromQueue(T &element) {
+  std::lock_guard<std::mutex> guard(m_queueMutex);
+  const bool hasHead = !m_realQueue.empty();
+  if (hasHead) {
+    element = m_realQueue.front();
+    m_realQueue.pop();
+  }
+  return hasHead;
+}
+
+#endif // OPENZIM_LIBZIM_QUEUE_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@mgautier.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_TINYSTRING_H
+#define ZIM_WRITER_TINYSTRING_H
+
+#include "../zim_types.h"
+#include <cstring>
+
+namespace zim
+{
+ namespace writer {
+ // Compact byte string: a heap buffer plus a 16-bit length.
+ //
+ // The buffer holds exactly `m_size` bytes copied from the source string;
+ // this class does not add a terminating NUL. Instances can be moved but
+ // not copied. A default-constructed (or moved-from) instance has a null
+ // buffer and a size of 0.
+ class TinyString {
+   public: // functions
+     // Build an empty string, without any buffer.
+     TinyString() :
+       m_data(nullptr),
+       m_size(0)
+     {}
+
+     // Copy the bytes of `s`.
+     //
+     // Throws std::runtime_error if `s` is 0xFFFF bytes long or more. The
+     // length is validated BEFORE the buffer is allocated: the destructor
+     // is not run when a constructor throws, so allocating first would
+     // leak the buffer.
+     TinyString(const std::string& s) :
+       m_data(nullptr),
+       m_size(0)
+     {
+       if (s.size() >= 0xFFFF) {
+         throw std::runtime_error("String len is too big");
+       }
+       m_size = static_cast<uint16_t>(s.size());
+       m_data = new char[m_size];
+       std::memcpy(m_data, s.data(), m_size);
+     }
+
+     // Steal the buffer of `t`, which is left empty.
+     TinyString(TinyString&& t):
+       m_data(t.m_data),
+       m_size(t.m_size)
+     {
+       t.m_data = nullptr;
+       t.m_size = 0;
+     };
+     TinyString(const TinyString& t) = delete;
+     ~TinyString() {
+       if (m_data) {
+         delete[] m_data;
+         m_data = nullptr;
+       }
+     }
+
+     // Copy of the stored bytes as a std::string.
+     operator std::string() const {
+       // Do not hand the null buffer of an empty instance to std::string.
+       return (m_size == 0) ? std::string() : std::string(m_data, m_size);
+     }
+     bool empty() const { return m_size == 0; }
+     size_t size() const { return m_size; }
+     // Raw buffer (may be null for an empty instance); not NUL-terminated
+     // by this class.
+     const char* data() const { return m_data; }
+
+     // Byte-wise equality.
+     bool operator==(const TinyString& other) const {
+       if (m_size != other.m_size) {
+         return false;
+       }
+       // memcmp must not be called with the null buffer of an empty instance.
+       return (m_size == 0) || (std::memcmp(m_data, other.m_data, m_size) == 0);
+     }
+
+     // Byte-wise lexicographic order; on a common prefix the shorter
+     // string sorts first.
+     bool operator<(const TinyString& other) const {
+       auto min_size = std::min(m_size, other.m_size);
+       // memcmp must not be called with the null buffer of an empty instance.
+       auto ret = (min_size == 0) ? 0 : std::memcmp(m_data, other.m_data, min_size);
+       if (ret == 0) {
+         return m_size < other.m_size;
+       } else {
+         return ret < 0;
+       }
+     }
+
+   protected: // members
+     char* m_data;
+     uint16_t m_size;
+ } PACKED;
+
+ // TinyString holding an entry path and, only when it differs from the
+ // path, its title. The stored bytes are the path, a NUL byte, then the
+ // title (if any).
+ class PathTitleTinyString : public TinyString {
+   public:
+     PathTitleTinyString() : TinyString() {}
+     PathTitleTinyString(const std::string& path, const std::string& title)
+       : TinyString(concat(path, title))
+     {}
+
+     // Build the stored representation of (path, title).
+     static std::string concat(const std::string& path, const std::string& title) {
+       // Take the path together with its terminating NUL byte.
+       std::string packed(path.data(), path.size()+1);
+       if (path != title) {
+         packed += title;
+       }
+       return packed;
+     }
+
+     // The path part (everything before the first NUL byte); an empty
+     // string for an empty instance.
+     std::string getPath() const {
+       return (m_size == 0) ? std::string() : std::string(m_data);
+     }
+
+     // The title part.
+     //
+     // When no title is stored (nothing follows the NUL byte), the result
+     // is an empty string if `storedOnly` is true and the path otherwise.
+     std::string getTitle(bool storedOnly) const {
+       if (m_size == 0) {
+         return std::string();
+       }
+
+       const auto titleOffset = std::strlen(m_data) + 1;
+       if (titleOffset != m_size) {
+         return std::string(m_data+titleOffset, m_size-titleOffset);
+       }
+
+       // No title stored.
+       if (storedOnly) {
+         return std::string(); // return empty title
+       }
+       return std::string(m_data); // return the path as a title
+     }
+ } PACKED;
+ }
+}
+
+#endif // ZIM_WRITER_TINYSTRING_H
+
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "titleListingHandler.h"
+#include "creatordata.h"
+
+#include "../endian_tools.h"
+
+#include <zim/writer/contentProvider.h>
+#include <zim/blob.h>
+
+using namespace zim::writer;
+
+namespace {
+
+// Content provider serializing a listing of dirents: each call to feed()
+// yields the index of the next listed dirent as a little-endian
+// zim::entry_index_type, following the order of the given container.
+// When `frontOnly` is set, only dirents flagged as front article are listed.
+class ListingProvider : public ContentProvider {
+  public:
+    ListingProvider(const TitleListingHandler::Dirents* dirents, bool frontOnly)
+      : mp_dirents(dirents),
+        m_it(dirents->begin()),
+        m_frontOnly(frontOnly)
+    {}
+
+    // Size in bytes of the listing: sizeof(zim::entry_index_type) for each
+    // listed dirent.
+    zim::size_type getSize() const override {
+      if (!m_frontOnly) {
+        return mp_dirents->size() * sizeof(zim::entry_index_type);
+      }
+
+      auto nbFrontArticles = std::count_if(
+        mp_dirents->begin(),
+        mp_dirents->end(),
+        [](Dirent* d) { return d->isFrontArticle();});
+      return nbFrontArticles * sizeof(zim::entry_index_type);
+    }
+
+    // Return the next index of the listing, or an empty blob once every
+    // dirent has been visited. The returned blob points to an internal
+    // buffer which is overwritten by the next call.
+    zim::Blob feed() override {
+      if (m_frontOnly) {
+        // Skip everything which is not a front article.
+        for (; m_it != mp_dirents->end() && !(*m_it)->isFrontArticle(); ++m_it) {
+        }
+      }
+
+      const bool exhausted = (m_it == mp_dirents->end());
+      if (exhausted) {
+        return zim::Blob(nullptr, 0);
+      }
+
+      zim::toLittleEndian((*m_it)->getIdx().v, buffer);
+      ++m_it;
+      return zim::Blob(buffer, sizeof(zim::entry_index_type));
+    }
+
+  private:
+    const TitleListingHandler::Dirents* mp_dirents;
+    char buffer[sizeof(zim::entry_index_type)];
+    TitleListingHandler::Dirents::const_iterator m_it;
+    bool m_frontOnly;
+};
+
+} // end of anonymous namespace
+
+// Build a handler bound to `data` (kept as a raw pointer, used later to
+// create the listing dirents). No front article is known at start.
+TitleListingHandler::TitleListingHandler(CreatorData* data)
+ : mp_creatorData(data),
+ m_hasFrontArticles(false)
+{}
+
+TitleListingHandler::~TitleListingHandler() = default;
+
+// Nothing to prepare for this handler.
+void TitleListingHandler::start() {
+}
+
+// Finalize the list of handled dirents: drop the ones marked as removed,
+// then sort the remaining ones with TitleCompare.
+void TitleListingHandler::stop() {
+  const auto isRemoved = [](const Dirent* d) { return d->isRemoved(); };
+  const auto firstRemoved = std::remove_if(
+    m_handledDirents.begin(), m_handledDirents.end(), isRemoved);
+  m_handledDirents.erase(firstRemoved, m_handledDirents.end());
+
+  std::sort(m_handledDirents.begin(), m_handledDirents.end(), TitleCompare());
+}
+
+// Create the dirents of the listings produced by this handler.
+//
+// "listing/titleOrdered/v0" is always created; "listing/titleOrdered/v1"
+// is created only if at least one front article has been handled.
+DirentHandler::Dirents TitleListingHandler::createDirents() const {
+  Dirents listingDirents;
+
+  auto v0Dirent = mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v0", "application/octet-stream+zimlisting", "");
+  listingDirents.push_back(v0Dirent);
+
+  if (m_hasFrontArticles) {
+    auto v1Dirent = mp_creatorData->createDirent(NS::X, "listing/titleOrdered/v1", "application/octet-stream+zimlisting", "");
+    listingDirents.push_back(v1Dirent);
+  }
+
+  return listingDirents;
+}
+
+// Create the content providers of the listings, in the same order as the
+// dirents of createDirents(): first the listing of all handled dirents,
+// then (only if a front article has been handled) the listing restricted
+// to front articles.
+DirentHandler::ContentProviders TitleListingHandler::getContentProviders() const {
+  ContentProviders providers;
+
+  std::unique_ptr<ContentProvider> fullListing(new ListingProvider(&m_handledDirents, false));
+  providers.push_back(std::move(fullListing));
+
+  if (m_hasFrontArticles) {
+    std::unique_ptr<ContentProvider> frontListing(new ListingProvider(&m_handledDirents, true));
+    providers.push_back(std::move(frontListing));
+  }
+
+  return providers;
+}
+
+// Register the dirent of an item in the listing, using the amended hints
+// of the item to decide whether it is a front article.
+//
+// The DirentHandler interface documents that `item` may be nullptr. Such a
+// dirent is handled as if no hint were given, instead of dereferencing the
+// null pointer.
+void TitleListingHandler::handle(Dirent* dirent, std::shared_ptr<Item> item)
+{
+  if (!item) {
+    handle(dirent, Hints());
+    return;
+  }
+  handle(dirent, item->getAmendedHints());
+}
+
+// Register `dirent` in the listing and flag it as front article if asked to.
+//
+// Every dirent is recorded. Only a dirent of the `C` namespace whose hints
+// carry a true FRONT_ARTICLE value is flagged as front article.
+void TitleListingHandler::handle(Dirent* dirent, const Hints& hints)
+{
+  m_handledDirents.push_back(dirent);
+
+  // By definition, dirents outside of the `C` namespace are not FRONT_ARTICLE.
+  const bool outsideContentNamespace = (dirent->getNamespace() != NS::C);
+  if (outsideContentNamespace) {
+    return;
+  }
+
+  try {
+    const bool flaggedAsFront = bool(hints.at(FRONT_ARTICLE));
+    if (flaggedAsFront) {
+      m_hasFrontArticles = true;
+      dirent->setFrontArticle();
+    }
+  } catch(std::out_of_range&) {
+    // No FRONT_ARTICLE hint given: the dirent is not a front article.
+  }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_LISTING_HANDLER_H
+#define OPENZIM_LIBZIM_LISTING_HANDLER_H
+
+#include "handler.h"
+#include "_dirent.h"
+
+#include <vector>
+
+namespace zim {
+namespace writer {
+
+// Comparison functor on dirent pointers, forwarding to compareTitle().
+// Used as the ordering predicate when sorting the title listing.
+struct TitleCompare {
+ bool operator() (const Dirent* d1, const Dirent* d2) const {
+ return compareTitle(d1, d2);
+ }
+};
+
+// This handler is in charge of handling titles.
+// It will create the "classic" old V0 title listing (for ALL entries) but also
+// the V1 title listing (for front article only).
+class TitleListingHandler : public DirentHandler {
+ public:
+ explicit TitleListingHandler(CreatorData* data);
+ virtual ~TitleListingHandler();
+
+ void start() override;
+ // Remove the dirents marked as removed and sort the others by title.
+ void stop() override;
+ bool isCompressible() override { return false; }
+ // One provider per listing: all handled dirents, then front articles only
+ // (the latter only if a front article has been handled).
+ ContentProviders getContentProviders() const override;
+ void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
+ void handle(Dirent* dirent, const Hints& hints) override;
+
+ protected:
+ Dirents createDirents() const override;
+ CreatorData* mp_creatorData;
+ // Every dirent passed to handle(), in call order until stop() sorts them.
+ Dirents m_handledDirents;
+ // Set as soon as one handled dirent has been flagged as front article.
+ bool m_hasFrontArticles;
+};
+}
+}
+
+#endif // OPENZIM_LIBZIM_LISTING_HANDLER_H
--- /dev/null
+/*
+ * Copyright (C) 2019-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "workers.h"
+#include "cluster.h"
+#include "creatordata.h"
+
+#include "../tools.h"
+
+#ifdef _WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+namespace zim
+{
+ namespace writer
+ {
+
+ // Worker loop running the tasks queued in `taskList`.
+ //
+ // `arg` is cast to a CreatorData*. Tasks are popped one by one, run and
+ // then deleted; a null task ends the loop. The value passed to
+ // microsleep() grows by 100 at each iteration and is reset to 0 after a
+ // task has been run. Always returns nullptr.
+ void* taskRunner(void* arg) {
+   auto creatorData = static_cast<zim::writer::CreatorData*>(arg);
+   Task* task;
+   unsigned int backoff = 0;
+
+   for (;;) {
+     microsleep(backoff);
+     backoff += 100;
+
+     const bool gotTask = creatorData->taskList.popFromQueue(task);
+     if (!gotTask) {
+       continue;
+     }
+     if (task == nullptr) {
+       // End-of-work marker.
+       return nullptr;
+     }
+
+     task->run(creatorData);
+     delete task;
+     backoff = 0;
+   }
+   return nullptr;
+ }
+
+ // Worker loop writing the clusters queued in `clusterToWrite` to `out_fd`.
+ //
+ // `arg` is cast to a CreatorData*. The head of the queue is only peeked
+ // at first: a null cluster ends the loop, and a cluster which is not
+ // closed yet is left in the queue and checked again at the next
+ // iteration. A closed cluster is popped, gets the current position of
+ // `out_fd` as offset, is written and has its data cleared. The value
+ // passed to microsleep() grows by 100 at each iteration and is reset to 0
+ // after a cluster has been written. Always returns nullptr.
+ void* clusterWriter(void* arg) {
+   auto creatorData = static_cast<zim::writer::CreatorData*>(arg);
+   Cluster* cluster;
+   unsigned int backoff = 0;
+
+   for (;;) {
+     microsleep(backoff);
+     backoff += 100;
+
+     const bool hasHead = creatorData->clusterToWrite.getHead(cluster);
+     if (!hasHead) {
+       continue;
+     }
+     if (cluster == nullptr) {
+       // All clusters written, we can quit
+       return nullptr;
+     }
+     if (!cluster->isClosed()) {
+       continue;
+     }
+
+     creatorData->clusterToWrite.popFromQueue(cluster);
+     const auto position = lseek(creatorData->out_fd, 0, SEEK_CUR);
+     cluster->setOffset(offset_t(position));
+     cluster->write(creatorData->out_fd);
+     cluster->clear_data();
+     backoff = 0;
+   }
+   return nullptr;
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2019-2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_WORKERS_H
+#define OPENZIM_LIBZIM_WORKERS_H
+
+namespace zim {
+namespace writer {
+
+class CreatorData;
+
+// Abstract unit of work. Concrete tasks implement run(), which receives
+// the CreatorData the task is executed for.
+class Task {
+ public:
+ Task() = default;
+ virtual ~Task() = default;
+
+ virtual void run(CreatorData* data) = 0;
+};
+
+void* taskRunner(void* data);
+void* clusterWriter(void* data);
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_WORKERS_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "xapianHandler.h"
+#include "xapianIndexer.h"
+#include "xapianWorker.h"
+#include "creatordata.h"
+
+#include <zim/writer/contentProvider.h>
+
+using namespace zim::writer;
+
+// Build the indexers of the handler.
+//
+// The title indexer is always created (index path: zimName + "_title.idx").
+// The fulltext indexer (index path: zimName + "_fulltext.idx") is created
+// only when `withFulltextIndex` is true; otherwise mp_fulltextIndexer is
+// left null. Both use the indexing language of `data`.
+XapianHandler::XapianHandler(CreatorData* data, bool withFulltextIndex)
+ : mp_fulltextIndexer(withFulltextIndex ? new XapianIndexer(data->zimName+"_fulltext.idx", data->indexingLanguage, IndexingMode::FULL, true) : nullptr),
+ mp_titleIndexer(new XapianIndexer(data->zimName+"_title.idx", data->indexingLanguage, IndexingMode::TITLE, true)),
+ mp_creatorData(data)
+{}
+
+XapianHandler::~XapianHandler() = default;
+
+// Run the indexing prelude of the fulltext indexer (when there is one),
+// then of the title indexer.
+void XapianHandler::start() {
+  const bool hasFulltextIndexer = static_cast<bool>(mp_fulltextIndexer);
+  if (hasFulltextIndexer) {
+    mp_fulltextIndexer->indexingPrelude();
+  }
+
+  mp_titleIndexer->indexingPrelude();
+}
+
+// Run the indexing postlude of the fulltext indexer (when there is one),
+// then of the title indexer.
+void XapianHandler::stop() {
+  const bool hasFulltextIndexer = static_cast<bool>(mp_fulltextIndexer);
+  if (hasFulltextIndexer) {
+    // All indexation tasks must be done before the xapian database is
+    // closed, so wait for them first.
+    IndexTask::waitNoMoreTask();
+    mp_fulltextIndexer->indexingPostlude();
+  }
+
+  mp_titleIndexer->indexingPostlude();
+}
+
+// Create one dirent per non-empty index: "fulltext/xapian" first (when a
+// fulltext indexer exists), then "title/xapian".
+DirentHandler::Dirents XapianHandler::createDirents() const {
+  Dirents indexDirents;
+
+  if (mp_fulltextIndexer) {
+    // Wait for all tasks to be done before checking if the index is empty.
+    IndexTask::waitNoMoreTask();
+    const bool fulltextEmpty = mp_fulltextIndexer->is_empty();
+    if (!fulltextEmpty) {
+      indexDirents.push_back(mp_creatorData->createDirent(NS::X, "fulltext/xapian", "application/octet-stream+xapian", ""));
+    }
+  }
+
+  const bool titleEmpty = mp_titleIndexer->is_empty();
+  if (!titleEmpty) {
+    indexDirents.push_back(mp_creatorData->createDirent(NS::X, "title/xapian", "application/octet-stream+xapian", ""));
+  }
+
+  return indexDirents;
+}
+
+// Create one FileProvider per non-empty index, reading the index file of
+// the indexer: fulltext first (when a fulltext indexer exists), then title.
+DirentHandler::ContentProviders XapianHandler::getContentProviders() const {
+  ContentProviders providers;
+
+  if (mp_fulltextIndexer) {
+    if (!mp_fulltextIndexer->is_empty()) {
+      std::unique_ptr<ContentProvider> fulltextProvider(new FileProvider(mp_fulltextIndexer->getIndexPath()));
+      providers.push_back(std::move(fulltextProvider));
+    }
+  }
+
+  if (!mp_titleIndexer->is_empty()) {
+    std::unique_ptr<ContentProvider> titleProvider(new FileProvider(mp_titleIndexer->getIndexPath()));
+    providers.push_back(std::move(titleProvider));
+  }
+
+  return providers;
+}
+
+// Add the title of `dirent` to the title index.
+//
+// Nothing is indexed when the real title of the dirent is empty. For a
+// redirect, the redirect path is given to the indexer as target path.
+void XapianHandler::indexTitle(Dirent* dirent) {
+  auto title = dirent->getRealTitle();
+  if (title.empty()) {
+    return;
+  }
+
+  auto path = dirent->getPath();
+  if (!dirent->isRedirect()) {
+    mp_titleIndexer->indexTitle(path, title);
+    return;
+  }
+
+  auto redirectPath = dirent->getRedirectPath();
+  mp_titleIndexer->indexTitle(path, title, redirectPath);
+}
+
+// Index the title of `dirent` if it is a front article.
+//
+// Only dirents of the `C` namespace whose hints carry a true FRONT_ARTICLE
+// value are indexed; anything else is ignored.
+void XapianHandler::handle(Dirent* dirent, const Hints& hints)
+{
+  const bool outsideContentNamespace = (dirent->getNamespace() != NS::C);
+  if (outsideContentNamespace) {
+    return;
+  }
+
+  try {
+    const bool flaggedAsFront = bool(hints.at(FRONT_ARTICLE));
+    if (flaggedAsFront) {
+      indexTitle(dirent);
+    }
+  } catch(std::out_of_range&) {
+    // No FRONT_ARTICLE hint given: nothing to index.
+  }
+}
+
+// Index an item: its title (through the hints overload, using the amended
+// hints of the item) and, when a fulltext indexer exists, its content.
+//
+// Only dirents of the `C` namespace are indexed. The DirentHandler
+// interface documents that `item` may be nullptr: without an item there is
+// neither a hint nor index data, so nothing is indexed (instead of
+// dereferencing the null pointer).
+//
+// The fulltext indexation itself is not done here: an IndexTask is pushed
+// to the task list of the creator.
+void XapianHandler::handle(Dirent* dirent, std::shared_ptr<Item> item)
+{
+  if (dirent->getNamespace() != NS::C) {
+    return;
+  }
+
+  if (!item) {
+    return;
+  }
+
+  // Title index.
+  handle(dirent, item->getAmendedHints());
+
+  // FullText index
+  if (mp_fulltextIndexer) {
+    auto indexData = item->getIndexData();
+    if (!indexData || !indexData->hasIndexData()) {
+      return;
+    }
+    auto title = indexData->getTitle();
+    auto path = dirent->getPath();
+    mp_creatorData->taskList.pushToQueue(new IndexTask(indexData, path, title, mp_fulltextIndexer.get()));
+  }
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_XAPIAN_HANDLER_H
+#define OPENZIM_LIBZIM_XAPIAN_HANDLER_H
+
+#include "handler.h"
+
+namespace zim {
+namespace writer {
+
+class XapianIndexer;
+
+// Handler building the xapian indexes of the archive: a title index
+// (always) and a fulltext index (only if asked at construction).
+class XapianHandler : public DirentHandler {
+ public:
+ XapianHandler(CreatorData* data, bool withFullTextIndex);
+ virtual ~XapianHandler();
+
+ void start() override;
+ void stop() override;
+ bool isCompressible() override { return false; }
+ ContentProviders getContentProviders() const override;
+ void handle(Dirent* dirent, std::shared_ptr<Item> item) override;
+ void handle(Dirent* dirent, const Hints& hints) override;
+
+ protected:
+ Dirents createDirents() const override;
+
+ private: // methods
+ void indexTitle(Dirent* dirent);
+
+ private: // data
+ // Null when no fulltext index has been asked.
+ std::unique_ptr<XapianIndexer> mp_fulltextIndexer;
+ std::unique_ptr<XapianIndexer> mp_titleIndexer;
+ CreatorData* mp_creatorData;
+};
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_XAPIAN_HANDLER_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2011 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "xapianIndexer.h"
+#include "libzim-resources.h"
+#include "fs.h"
+#include "tools.h"
+#include "../constants.h"
+#include <sstream>
+#include <fstream>
+#include <stdexcept>
+#include <cassert>
+
+using namespace zim::writer;
+
+/* Constructor */
+// Set up an indexer writing to `indexPath`.
+//
+// The stemmer language is the language code ICU extracts from `language`.
+// The stopwords are read from the "stopwords/<language>" resource, one
+// word per line; if that resource does not exist, no stopword is used.
+// `verbose` is not used.
+XapianIndexer::XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode indexingMode, const bool verbose)
+  : indexPath(indexPath),
+    language(language),
+    indexingMode(indexingMode)
+{
+  /* Build ICU Locale object to retrieve ISO-639 language code (from
+     ISO-639-3) */
+  const icu::Locale languageLocale(language.c_str());
+  stemmer_language = languageLocale.getLanguage();
+
+  /* Read the stopwords */
+  try {
+    stopwords = getResource("stopwords/" + language);
+  } catch(ResourceNotFound&) {}
+
+  std::istringstream stopwordStream(stopwords);
+  for (std::string word; std::getline(stopwordStream, word, '\n'); ) {
+    stopper.add(word);
+  }
+}
+
+// Remove the index files (`indexPath` + ".tmp" and `indexPath`).
+//
+// Nothing is removed on Windows (see the TODO below). Any exception thrown
+// by the removal is swallowed so the destructor never throws.
+XapianIndexer::~XapianIndexer()
+{
+ if (!indexPath.empty()) {
+ try {
+#ifndef _WIN32
+//[TODO] Implement remove for windows
+ zim::DEFAULTFS::remove(indexPath + ".tmp");
+ zim::DEFAULTFS::remove(indexPath);
+#endif
+ } catch (...) {
+ /* Do not raise */
+ }
+ }
+}
+
+/*
+ * `valuesmap` is a metadata associated with the Xapian database. We are using it
+ * to attach slot numbers of each document in the index to the value they are storing.
+ * These values and slot numbers are used in collapsing, filtering etc.
+ *
+ * Title index:
+ * Slot 0: Title of the article. Used in collapsing articles with same name.
+ * Slot 1: path/redirectPath of the article. Used in collapsing duplicates(redirects).
+ *
+ * Fulltext Index:
+ * Slot 0: Title of the article. Used in collapsing articles with same name.
+ * Slot 1: Word count of the article.
+ * Slot 2: Geo position of the article. Used for geo-filtering.
+ *
+ * The `kind` metadata indicates whether the database is a title or a fulltext index.
+ *
+ * The `data` metadata indicates the type of data stored in the index. A value of
+ * "fullPath" means the data stores the complete path, including the namespace.
+ */
+
+// Open the writable database at `indexPath` + ".tmp" (created or
+// overwritten), store the metadata describing the index ("valuesmap",
+// "kind" and "data" depend on the indexing mode; "language" and
+// "stopwords" are always set) and begin a transaction.
+void XapianIndexer::indexingPrelude()
+{
+  const auto dbFlags = Xapian::DB_CREATE_OR_OVERWRITE | Xapian::DB_NO_TERMLIST;
+  writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", dbFlags);
+
+  if (indexingMode == IndexingMode::TITLE) {
+    writableDatabase.set_metadata("valuesmap", "title:0;targetPath:1");
+    writableDatabase.set_metadata("kind", "title");
+    writableDatabase.set_metadata("data", "fullPath");
+  } else if (indexingMode == IndexingMode::FULL) {
+    writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2");
+    writableDatabase.set_metadata("kind", "fulltext");
+    writableDatabase.set_metadata("data", "fullPath");
+  }
+
+  writableDatabase.set_metadata("language", language);
+  writableDatabase.set_metadata("stopwords", stopwords);
+  writableDatabase.begin_transaction(true);
+}
+
+/*
+ * For title index, index the full path with namespace as data of the document.
+ * The targetPath in valuesmap will store the path without namespace.
+ * TODO:
+ * Currently for title index we are storing path twice (redirectPath/path in
+ * valuesmap and path in index data). In the future, we want to keep only one of
+ * these(index data if possible) to reduce index size while supporting the
+ * collapse on path feature.
+ */
+
+// Add a document for `title` to the title index.
+//
+// The document data is "C/" + path. Value slot 0 holds the title; value
+// slot 1 holds `targetPath`, or `path` when `targetPath` is empty. The
+// indexed text is ANCHOR_TERM followed by the title without accents
+// (nothing is indexed if that title is empty). If no stemmer can be set up
+// for the language, indexing goes on without stemming.
+// Must only be called on an indexer in TITLE mode.
+void XapianIndexer::indexTitle(const std::string& path, const std::string& title, const std::string& targetPath)
+{
+  assert(indexingMode == IndexingMode::TITLE);
+
+  Xapian::TermGenerator termGenerator;
+  try {
+    Xapian::Stem languageStemmer(stemmer_language);
+    termGenerator.set_stemmer(languageStemmer);
+    termGenerator.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
+  } catch (...) {}
+
+  Xapian::Document titleDocument;
+  titleDocument.clear_values();
+
+  const std::string fullPath = "C/" + path;
+  titleDocument.set_data(fullPath);
+  termGenerator.set_document(titleDocument);
+
+  const std::string unaccentedTitle = zim::removeAccents(title);
+
+  titleDocument.add_value(0, title);
+  titleDocument.add_value(1, targetPath.empty() ? path : targetPath);
+
+  if (!unaccentedTitle.empty()) {
+    const std::string anchoredTitle = ANCHOR_TERM + unaccentedTitle;
+    termGenerator.index_text(anchoredTitle, 1);
+  }
+
+  /* add to the database */
+  writableDatabase.add_document(titleDocument);
+  empty = false;
+}
+
+// Commit the current transaction and immediately begin a new one.
+void XapianIndexer::flush()
+{
+ this->writableDatabase.commit_transaction();
+ this->writableDatabase.begin_transaction(true);
+}
+
+// Finish the indexing: commit what is pending, compact the database into
+// the single file `indexPath`, then close the database.
+void XapianIndexer::indexingPostlude()
+{
+ this->flush();
+ // flush() has begun a new transaction: commit it as well.
+ this->writableDatabase.commit_transaction();
+ this->writableDatabase.commit();
+ this->writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE);
+ this->writableDatabase.close();
+}
+
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ * Copyright (C) 2011 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef LIBZIM_WRITER_XAPIANINDEXER_H
+#define LIBZIM_WRITER_XAPIANINDEXER_H
+
+#include <zim/writer/item.h>
+
+#include <unicode/locid.h>
+#include <xapian.h>
+#include <zim/blob.h>
+
+
+namespace zim {
+namespace writer {
+
+class IndexTask;
+
+// Kind of index built by a XapianIndexer.
+enum class IndexingMode {
+ // Title index.
+ TITLE,
+ // Fulltext index.
+ FULL
+};
+
+// Writer of one xapian index (title or fulltext, depending on the mode).
+//
+// Usage: indexingPrelude(), then add documents, then indexingPostlude().
+// IndexTask is a friend and adds the fulltext documents itself.
+class XapianIndexer
+{
+ public:
+ XapianIndexer(const std::string& indexPath, const std::string& language, IndexingMode mode, bool verbose);
+ virtual ~XapianIndexer();
+ // Path of the final (compacted) index file.
+ std::string getIndexPath() { return indexPath; }
+ void indexingPrelude();
+ void flush();
+ void indexingPostlude();
+ // True as long as no document has been added to the index.
+ bool is_empty() { return empty; }
+
+ void indexTitle(const std::string& path, const std::string& title, const std::string& targetPath = "");
+
+ protected:
+ Xapian::WritableDatabase writableDatabase;
+ bool empty {true};
+ std::string stemmer_language;
+ Xapian::SimpleStopper stopper;
+ std::string indexPath;
+ std::string language;
+ // Raw content of the stopwords resource (one word per line).
+ std::string stopwords;
+ IndexingMode indexingMode;
+
+ friend class zim::writer::IndexTask;
+};
+
+}
+}
+
+#endif // LIBZIM_WRITER_XAPIANINDEXER_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "xapianWorker.h"
+#include "creatordata.h"
+
+#include "xapianIndexer.h"
+
+#include <stdexcept>
+#include <sstream>
+#include <mutex>
+
+static std::mutex s_dbaccessLock;
+std::atomic<unsigned long> zim::writer::IndexTask::waiting_task(0);
+
+namespace zim
+{
+ namespace writer
+ {
+
+ const unsigned int keywordsBoostFactor = 3;
+ // Weight given to the title terms of a document: 1, plus 1 for every
+ // full 500 units of content length.
+ inline unsigned int getTitleBoostFactor(const unsigned int contentLength)
+ {
+   const unsigned int fullChunks = contentLength / 500;
+   return 1 + fullChunks;
+ }
+
+ // Block until no IndexTask is alive anymore (`waiting_task` is 0).
+ //
+ // The counter is polled; the value passed to microsleep() starts at 0 and
+ // grows by 10 at each iteration.
+ void IndexTask::waitNoMoreTask() {
+   unsigned int backoff = 0;
+   for (;;) {
+     microsleep(backoff);
+     backoff += 10;
+     if (waiting_task.load() == 0) {
+       break;
+     }
+   }
+ }
+
+ // Build the fulltext document of the item and add it to the database of
+ // the indexer.
+ //
+ // The document is prepared without any lock; only the insertion in the
+ // database is serialized with `s_dbaccessLock`. `data` is not used.
+ void IndexTask::run(CreatorData* data) {
+ Xapian::Stem stemmer;
+ Xapian::TermGenerator indexer;
+ try {
+ stemmer = Xapian::Stem(mp_indexer->stemmer_language);
+ indexer.set_stemmer(stemmer);
+ indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
+ } catch (...) {
+ // No stemming for language.
+ }
+ indexer.set_stopper(&mp_indexer->stopper);
+ indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
+
+ Xapian::Document document;
+ indexer.set_document(document);
+
+ // Document data is the full path, including the namespace.
+ std::string fullPath = "C/" + m_path;
+ document.set_data(fullPath);
+ // Value slot 0: title.
+ document.add_value(0, m_title);
+
+ // Value slot 1: word count, stored as text.
+ std::stringstream countWordStringStream;
+ countWordStringStream << mp_indexData->getWordCount();
+ document.add_value(1, countWordStringStream.str());
+
+ // Value slot 2: serialised geo position, only if the first element of
+ // the tuple is true.
+ auto geoInfo = mp_indexData->getGeoPosition();
+ if (std::get<0>(geoInfo)) {
+ auto geoPosition = Xapian::LatLongCoord(
+ std::get<1>(geoInfo), std::get<2>(geoInfo)).serialise();
+ document.add_value(2, geoPosition);
+ }
+
+ /* Index the content */
+ auto indexContent = mp_indexData->getContent();
+ if (!indexContent.empty()) {
+ indexer.index_text_without_positions(indexContent);
+ }
+
+ /* Index the title */
+ // The weight of the title grows with the size of the content.
+ auto indexTitle = mp_indexData->getTitle();
+ if (!indexTitle.empty()) {
+ indexer.index_text_without_positions(
+ indexTitle, getTitleBoostFactor(indexContent.size()));
+ }
+
+ /* Index the keywords */
+ auto indexKeywords = mp_indexData->getKeywords();
+ if (!indexKeywords.empty()) {
+ indexer.index_text_without_positions(indexKeywords, keywordsBoostFactor);
+ }
+
+ // Serialize the accesses to the database.
+ std::lock_guard<std::mutex> l(s_dbaccessLock);
+ mp_indexer->writableDatabase.add_document(document);
+ mp_indexer->empty = false;
+ }
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2020-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_XAPIAN_WORKER_H
+#define OPENZIM_LIBZIM_XAPIAN_WORKER_H
+
+#include <atomic>
+#include <memory>
+#include "workers.h"
+#include <zim/writer/item.h>
+
+namespace zim {
+namespace writer {
+
+class Item;
+class XapianIndexer;
+
+// Task that indexes one entry through a XapianIndexer.
+//
+// The static counter `waiting_task` is incremented by the constructor and
+// decremented by the destructor, so it holds the number of IndexTask objects
+// currently alive. Instances cannot be copied.
+class IndexTask : public Task {
+ public:
+ IndexTask(const IndexTask&) = delete;
+ IndexTask& operator=(const IndexTask&) = delete;
+ // Stores the index data, path, title and indexer for later use by run().
+ // The indexer is held as a raw pointer.
+ IndexTask(std::shared_ptr<IndexData> indexData, const std::string& path, const std::string& title, XapianIndexer* indexer) :
+ mp_indexData(indexData),
+ m_path(path),
+ m_title(title),
+ mp_indexer(indexer)
+ {
+ ++waiting_task;
+ }
+ virtual ~IndexTask()
+ {
+ --waiting_task;
+ }
+
+ // Waits until `waiting_task` reaches zero.
+ static void waitNoMoreTask();
+
+ virtual void run(CreatorData* data);
+ // Number of IndexTask objects constructed and not yet destroyed.
+ static std::atomic<unsigned long> waiting_task;
+
+ private:
+ std::shared_ptr<IndexData> mp_indexData;
+ std::string m_path;
+ std::string m_title;
+ XapianIndexer* mp_indexer;
+};
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_XAPIAN_WORKER_H
--- /dev/null
+/* htmlparse.cc: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2001 Ananova Ltd
+ * Copyright 2002,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "htmlparse.h"
+
+#include <xapian.h>
+
+// #include "utf8convert.h"
+
+#include <algorithm>
+#include <mutex>
+
+#include <ctype.h>
+#include <cstring>
+#include <stdio.h>
+#include <stdlib.h>
+
+using namespace std;
+
+// Lower-case `str` in place, one char at a time, using tolower().
+inline void
+lowercase_string(string &str)
+{
+ for (string::iterator i = str.begin(); i != str.end(); ++i) {
+ // Cast first: tolower() needs a value representable as unsigned char.
+ *i = tolower(static_cast<unsigned char>(*i));
+ }
+}
+
+map<string, unsigned int> zim::HtmlParser::named_ents;
+static std::mutex sInitLock;
+
+// Character predicates used with find_if(). Every classifier call goes
+// through an unsigned char cast, as the <ctype.h> functions require.
+
+// True unless `c` is a decimal digit.
+inline static bool
+p_notdigit(char c)
+{
+    return isdigit(static_cast<unsigned char>(c)) == 0;
+}
+
+// True unless `c` is a hexadecimal digit.
+inline static bool
+p_notxdigit(char c)
+{
+    return isxdigit(static_cast<unsigned char>(c)) == 0;
+}
+
+// True unless `c` is a letter or a digit.
+inline static bool
+p_notalnum(char c)
+{
+    return isalnum(static_cast<unsigned char>(c)) == 0;
+}
+
+// True unless `c` is whitespace.
+inline static bool
+p_notwhitespace(char c)
+{
+    return isspace(static_cast<unsigned char>(c)) == 0;
+}
+
+// True when `c` cannot be part of a tag name.
+inline static bool
+p_nottag(char c)
+{
+    // ':' for XML namespaces.
+    if (c == '.' || c == '-' || c == ':') return false;
+    return isalnum(static_cast<unsigned char>(c)) == 0;
+}
+
+// True when `c` is whitespace or '>'.
+inline static bool
+p_whitespacegt(char c)
+{
+    if (c == '>') return true;
+    return isspace(static_cast<unsigned char>(c)) != 0;
+}
+
+// True when `c` is whitespace, '=' or '>'.
+inline static bool
+p_whitespaceeqgt(char c)
+{
+    if (c == '=' || c == '>') return true;
+    return isspace(static_cast<unsigned char>(c)) != 0;
+}
+
+// Look up attribute `param` of the tag currently being reported.
+// On success copies its value into `value` and returns true; otherwise
+// returns false and leaves `value` untouched.
+bool
+zim::HtmlParser::get_parameter(const string & param, string & value)
+{
+    const auto found = parameters.find(param);
+    if (found != parameters.end()) {
+        value = found->second;
+        return true;
+    }
+    return false;
+}
+
+// Fill the shared named-entity table the first time a parser is built.
+// The fill runs under sInitLock and is skipped once the table has entries.
+zim::HtmlParser::HtmlParser()
+{
+    static const struct ent { const char *n; unsigned int v; } ents[] = {
+#include "namedentities.h"
+        { NULL, 0 }
+    };
+    std::lock_guard<std::mutex> l(sInitLock);
+    if (!named_ents.empty()) return;
+    // The { NULL, 0 } sentinel terminates the list.
+    for (const struct ent *entry = ents; entry->n; ++entry) {
+        named_ents[string(entry->n)] = entry->v;
+    }
+}
+
+/** Replace HTML character references in `s` with the characters they name.
+ *
+ *  Handles decimal (`&#NNN;`), hexadecimal (`&#xHHH;`) and named (`&name;`)
+ *  references; the trailing `;` is optional.  Values below 0x80 become a
+ *  single byte, larger values are encoded with
+ *  Xapian::Unicode::nonascii_to_utf8().  References which evaluate to 0
+ *  (unknown name, no digits) are left in the text unchanged.
+ *
+ *  @param s  Text to decode; modified in place.
+ */
+void
+zim::HtmlParser::decode_entities(string &s)
+{
+    // We need a const_iterator version of s.end() - otherwise the
+    // find() and find_if() templates don't work...
+    string::const_iterator amp = s.begin(), s_end = s.end();
+    while ((amp = find(amp, s_end, '&')) != s_end) {
+        unsigned int val = 0;
+        string::const_iterator end, p = amp + 1;
+        if (p != s_end && *p == '#') {
+            p++;
+            if (p != s_end && (*p == 'x' || *p == 'X')) {
+                // hex
+                p++;
+                end = find_if(p, s_end, p_notxdigit);
+                sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
+            } else {
+                // number
+                end = find_if(p, s_end, p_notdigit);
+                val = atoi(s.substr(p - s.begin(), end - p).c_str());
+            }
+        } else {
+            end = find_if(p, s_end, p_notalnum);
+            string code = s.substr(p - s.begin(), end - p);
+            map<string, unsigned int>::const_iterator i;
+            i = named_ents.find(code);
+            if (i != named_ents.end()) val = i->second;
+        }
+        if (end < s_end && *end == ';') end++;
+        if (val) {
+            string::size_type amp_pos = amp - s.begin();
+            // Number of bytes written in place of the reference.
+            string::size_type written = 1;
+            if (val < 0x80) {
+                s.replace(amp_pos, end - amp, 1u, char(val));
+            } else {
+                // Convert unicode value val to UTF-8.
+                char seq[4];
+                unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
+                s.replace(amp_pos, end - amp, seq, len);
+                written = len;
+            }
+            s_end = s.end();
+            // We've modified the string, so the iterators are no longer
+            // valid...  Resume just after the first replacement byte.  If
+            // nothing was written (presumably possible when the value can't
+            // be encoded - TODO confirm against the Xapian docs) resume at
+            // the same offset, so that amp can never move past s.end().
+            amp = s.begin() + amp_pos + (written ? 1 : 0);
+        } else {
+            amp = end;
+        }
+    }
+}
+
+/** Scan `body` as HTML and report its pieces through the virtual callbacks.
+ *
+ *  Text found between tags has its entities decoded and is passed to
+ *  process_text().  Closing tags are reported through closing_tag().  For an
+ *  opening tag the `name=value` attributes are stored in `parameters` (names
+ *  lower-cased, first occurrence kept), opening_tag() is called, and
+ *  `parameters` is cleared again.  Comments, SGML declarations and
+ *  `<? ... ?>` sections are skipped.  While inside <script>, `<` followed by
+ *  a letter is not treated as a tag.
+ *
+ *  A leading XML declaration sets `charset` to "UTF-8", or to the value of
+ *  its `encoding` attribute when that attribute is properly quoted.
+ *
+ *  @param body  The document text to scan.
+ */
+void
+zim::HtmlParser::parse_html(const string &body)
+{
+    in_script = false;
+
+    parameters.clear();
+    string::const_iterator start = body.begin();
+
+    while (true) {
+        // Skip through until we find an HTML tag, a comment, or the end of
+        // document.  Ignore isolated occurrences of `<' which don't start
+        // a tag or comment.
+        string::const_iterator p = start;
+        while (true) {
+            p = find(p, body.end(), '<');
+            if (p == body.end()) break;
+            // A `<' which is the last character can't start anything: treat
+            // it as text instead of dereferencing body.end().
+            if (p + 1 == body.end()) {
+                p = body.end();
+                break;
+            }
+            unsigned char ch = *(p + 1);
+
+            // Tag, closing tag, or comment (or SGML declaration).
+            if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
+
+            if (ch == '?') {
+                // PHP code or XML declaration.
+                // XML declaration is only valid at the start of the first line.
+                // FIXME: need to deal with BOMs...
+                if (p != body.begin() || body.size() < 20) break;
+
+                // XML declaration looks something like this:
+                // <?xml version="1.0" encoding="UTF-8"?>
+                if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
+                if (strchr(" \t\r\n", p[5]) == NULL) break;
+
+                string::const_iterator decl_end = find(p + 6, body.end(), '?');
+                if (decl_end == body.end()) break;
+
+                // Default charset for XML is UTF-8.
+                charset = "UTF-8";
+
+                string decl(p + 6, decl_end);
+                size_t enc = decl.find("encoding");
+                if (enc == string::npos) break;
+
+                enc = decl.find_first_not_of(" \t\r\n", enc + 8);
+                if (enc == string::npos || enc == decl.size()) break;
+
+                if (decl[enc] != '=') break;
+
+                enc = decl.find_first_not_of(" \t\r\n", enc + 1);
+                if (enc == string::npos || enc == decl.size()) break;
+
+                if (decl[enc] != '"' && decl[enc] != '\'') break;
+
+                char quote = decl[enc++];
+                size_t enc_end = decl.find(quote, enc);
+
+                // Only accept the encoding when its closing quote exists;
+                // otherwise keep the UTF-8 default set above.
+                if (enc_end != string::npos)
+                    charset = decl.substr(enc, enc_end - enc);
+
+                break;
+            }
+            p++;
+        }
+
+        // Process text up to start of tag.
+        if (p > start) {
+            string text = body.substr(start - body.begin(), p - start);
+            // convert_to_utf8(text, charset);
+            decode_entities(text);
+            process_text(text);
+        }
+
+        if (p == body.end()) break;
+
+        start = p + 1;
+
+        if (start == body.end()) break;
+
+        if (*start == '!') {
+            if (++start == body.end()) break;
+            if (++start == body.end()) break;
+            // comment or SGML declaration
+            if (*(start - 1) == '-' && *start == '-') {
+                ++start;
+                string::const_iterator close = find(start, body.end(), '>');
+                // An unterminated comment swallows rest of document
+                // (like Netscape, but unlike MSIE IIRC)
+                if (close == body.end()) break;
+
+                p = close;
+                // look for -->
+                while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
+                    p = find(p + 1, body.end(), '>');
+
+                if (p != body.end()) {
+                    // Check for htdig's "ignore this bit" comments.
+                    if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
+                        string::size_type i;
+                        i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
+                        if (i == string::npos) break;
+                        start = body.begin() + i + 21;
+                        continue;
+                    }
+                    // If we found --> skip to there.
+                    start = p;
+                } else {
+                    // Otherwise skip to the first > we found (as Netscape does).
+                    start = close;
+                }
+            } else {
+                // just an SGML declaration, perhaps giving the DTD - ignore it
+                start = find(start - 1, body.end(), '>');
+                if (start == body.end()) break;
+            }
+            ++start;
+        } else if (*start == '?') {
+            if (++start == body.end()) break;
+            // PHP - swallow until ?> or EOF
+            start = find(start + 1, body.end(), '>');
+
+            // look for ?>
+            while (start != body.end() && *(start - 1) != '?')
+                start = find(start + 1, body.end(), '>');
+
+            // unterminated PHP swallows rest of document (rather arbitrarily
+            // but it avoids polluting the database when things go wrong)
+            if (start != body.end()) ++start;
+        } else {
+            // opening or closing tag
+            int closing = 0;
+
+            if (*start == '/') {
+                closing = 1;
+                start = find_if(start + 1, body.end(), p_notwhitespace);
+            }
+
+            p = start;
+            start = find_if(start, body.end(), p_nottag);
+            string tag = body.substr(p - body.begin(), start - p);
+            // convert tagname to lowercase
+            lowercase_string(tag);
+
+            if (closing) {
+                closing_tag(tag);
+                if (in_script && tag == "script") in_script = false;
+
+                /* ignore any bogus parameters on closing tags */
+                p = find(start, body.end(), '>');
+                if (p == body.end()) break;
+                start = p + 1;
+            } else {
+                // FIXME: parse parameters lazily.
+                while (start < body.end() && *start != '>') {
+                    string name, value;
+
+                    p = find_if(start, body.end(), p_whitespaceeqgt);
+
+                    name.assign(body, start - body.begin(), p - start);
+
+                    p = find_if(p, body.end(), p_notwhitespace);
+
+                    start = p;
+                    if (start != body.end() && *start == '=') {
+                        start = find_if(start + 1, body.end(), p_notwhitespace);
+
+                        p = body.end();
+
+                        // The document may end right after the `='; don't
+                        // dereference body.end() in that case.
+                        int quote = (start != body.end()) ? *start : 0;
+                        if (quote == '"' || quote == '\'') {
+                            start++;
+                            p = find(start, body.end(), quote);
+                        }
+
+                        if (p == body.end()) {
+                            // unquoted or no closing quote
+                            p = find_if(start, body.end(), p_whitespacegt);
+                        }
+                        value.assign(body, start - body.begin(), p - start);
+                        start = find_if(p, body.end(), p_notwhitespace);
+
+                        if (!name.empty()) {
+                            // convert parameter name to lowercase
+                            lowercase_string(name);
+                            // in case of multiple entries, use the first
+                            // (as Netscape does)
+                            parameters.insert(make_pair(name, value));
+                        }
+                    }
+                }
+#if 0
+                cout << "<" << tag;
+                map<string, string>::const_iterator x;
+                for (x = parameters.begin(); x != parameters.end(); x++) {
+                    cout << " " << x->first << "=\"" << x->second << "\"";
+                }
+                cout << ">\n";
+#endif
+                opening_tag(tag);
+                parameters.clear();
+
+                // In <script> tags we ignore opening tags to avoid problems
+                // with "a<b".
+                if (tag == "script") in_script = true;
+
+                if (start != body.end() && *start == '>') ++start;
+            }
+        }
+    }
+}
--- /dev/null
+/* htmlparse.h: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_HTMLPARSE_H
+#define OMEGA_INCLUDED_HTMLPARSE_H
+
+#include <string>
+#include <map>
+
+using std::string;
+using std::map;
+
+namespace zim {
+
+// Minimal HTML scanner. parse_html() walks a document and reports what it
+// finds through the virtual process_text(), opening_tag() and closing_tag()
+// hooks, which do nothing by default and are meant to be overridden.
+class HtmlParser {
+ // Attributes of the opening tag currently being reported.
+ map<string, string> parameters;
+ protected:
+ // Replaces character references in `s` with the characters they denote.
+ void decode_entities(string &s);
+ // True while scanning the contents of a <script> element.
+ bool in_script;
+ string charset;
+ // Entity name -> code point table, shared by all parser instances.
+ static map<string, unsigned int> named_ents;
+
+ // Fetches attribute `param` into `value`; returns false if it is absent.
+ bool get_parameter(const string & param, string & value);
+ public:
+ virtual void process_text(const string &/*text*/) { }
+ virtual void opening_tag(const string &/*tag*/) { }
+ virtual void closing_tag(const string &/*tag*/) { }
+ virtual void parse_html(const string &text);
+ HtmlParser();
+ virtual ~HtmlParser() { }
+};
+
+};
+
+#endif // OMEGA_INCLUDED_HTMLPARSE_H
--- /dev/null
+/* myhtmlparse.cc: subclass of HtmlParser for extracting text.
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "myhtmlparse.h"
+
+// #include "utf8convert.h"
+
+#include <ctype.h>
+#include <sstream>
+#include <stdexcept>
+#include <string.h>
+
+// Lower-case `str` in place, one char at a time, using tolower().
+inline void lowercase_string(string &str) {
+ for (string::iterator i = str.begin(); i != str.end(); ++i) {
+ // Cast first: tolower() needs a value representable as unsigned char.
+ *i = tolower(static_cast<unsigned char>(*i));
+ }
+}
+
+// Parse `text` after recording the charset in use and whether that charset
+// was itself taken from a <meta> tag.
+void zim::MyHtmlParser::parse_html(const string &text, const string &charset_,
+                                   bool charset_from_meta_) {
+  this->charset_from_meta = charset_from_meta_;
+  this->charset = charset_;
+  HtmlParser::parse_html(text);
+}
+
+// Append the words of `text` to `dump`, collapsing each run of whitespace
+// into a single space. Text inside <script> or <style> is dropped.
+// `pending_space` carries a still-unwritten separator over to the next call.
+void zim::MyHtmlParser::process_text(const string &text) {
+  if (text.empty() || in_script_tag || in_style_tag)
+    return;
+
+  string::size_type word_start = text.find_first_not_of(WHITESPACE);
+  // Leading whitespace (or nothing but whitespace) means a separator is due.
+  if (word_start != 0)
+    pending_space = true;
+
+  while (word_start != string::npos) {
+    // No separator is written while dump is still empty.
+    if (pending_space && !dump.empty())
+      dump += ' ';
+
+    const string::size_type word_end = text.find_first_of(WHITESPACE, word_start);
+    if (word_end == string::npos) {
+      // Final word runs to the end of the text: no separator is owed.
+      pending_space = false;
+      dump.append(text, word_start, string::npos);
+      return;
+    }
+
+    pending_space = true;
+    dump.append(text, word_start, word_end - word_start);
+    word_start = text.find_first_not_of(WHITESPACE, word_end + 1);
+  }
+}
+
+/** Parse a float from the beginning of `str`.
+ *
+ *  Uses stream extraction: leading whitespace is skipped and any text after
+ *  the number is ignored.
+ *
+ *  @param str  Text expected to start with a floating-point number.
+ *  @return     The parsed value.
+ *  @throws std::invalid_argument  when no float can be extracted.  This
+ *          replaces returning an indeterminate value, and lets a caller's
+ *          try/catch skip malformed input.
+ */
+inline float _stof(std::string str) {
+  std::istringstream stream(str);
+  float ret = 0;
+  if (!(stream >> ret)) {
+    throw std::invalid_argument("_stof: no valid float in input");
+  }
+  return ret;
+}
+
+// Handle an opening tag.
+//
+// Most listed tags only set `pending_space`. <body> discards the text
+// collected so far, <style>/<script> switch text collection off, and <meta>
+// feeds `sample`, `keywords`, the geo position and the robots/charset checks.
+//
+// Throws `true` when a robots meta tag forbids indexing, and throws a string
+// holding the new charset name when a meta tag declares a charset different
+// from the current one.
+void zim::MyHtmlParser::opening_tag(const string &tag) {
+ if (tag.empty())
+ return;
+ // Dispatch on the first letter to limit the number of string comparisons.
+ switch (tag[0]) {
+ case 'a':
+ if (tag == "address")
+ pending_space = true;
+ break;
+ case 'b':
+ if (tag == "body") {
+ // Drop whatever text was collected before <body>.
+ dump.resize(0);
+ break;
+ }
+ if (tag == "blockquote" || tag == "br")
+ pending_space = true;
+ break;
+ case 'c':
+ if (tag == "center")
+ pending_space = true;
+ break;
+ case 'd':
+ if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
+ tag == "dt")
+ pending_space = true;
+ break;
+ case 'e':
+ if (tag == "embed")
+ pending_space = true;
+ break;
+ case 'f':
+ if (tag == "fieldset" || tag == "form")
+ pending_space = true;
+ break;
+ case 'h':
+ // hr, and h1, ..., h6
+ if (tag.length() == 2 && strchr("r123456", tag[1]))
+ pending_space = true;
+ break;
+ case 'i':
+ if (tag == "iframe" || tag == "img" || tag == "isindex" || tag == "input")
+ pending_space = true;
+ break;
+ case 'k':
+ if (tag == "keygen")
+ pending_space = true;
+ break;
+ case 'l':
+ if (tag == "legend" || tag == "li" || tag == "listing")
+ pending_space = true;
+ break;
+ case 'm':
+ if (tag == "meta") {
+ string content;
+ if (get_parameter("content", content)) {
+ string name;
+ if (get_parameter("name", name)) {
+ lowercase_string(name);
+ if (name == "description") {
+ // Only the first description is kept.
+ if (sample.empty()) {
+ swap(sample, content);
+ // convert_to_utf8(sample, charset);
+ decode_entities(sample);
+ }
+ } else if (name == "keywords") {
+ // Keywords from several meta tags are joined with a space.
+ if (!keywords.empty())
+ keywords += ' ';
+ // convert_to_utf8(content, charset);
+ decode_entities(content);
+ keywords += content;
+ } else if (name == "robots") {
+ decode_entities(content);
+ lowercase_string(content);
+ if (content.find("none") != string::npos ||
+ content.find("noindex") != string::npos) {
+ // Indexing is forbidden: record it and abort parsing.
+ indexing_allowed = false;
+ throw true;
+ }
+ } else if (name == "geo.position") {
+ // Expected form is "<latitude>;<longitude>".
+ auto sep_pos = content.find(";");
+ if (sep_pos != string::npos) {
+ try {
+ latitude = _stof(content.substr(0, sep_pos));
+ longitude = _stof(content.substr(sep_pos + 1));
+ has_geoPosition = true;
+ } catch (...) {
+ // invalid value in content, just pass and continue.
+ }
+ }
+ }
+ break;
+ }
+ // If the current charset came from a meta tag, don't
+ // force reparsing again!
+ if (charset_from_meta)
+ break;
+ string hdr;
+ if (get_parameter("http-equiv", hdr)) {
+ lowercase_string(hdr);
+ if (hdr == "content-type") {
+ lowercase_string(content);
+ size_t start = content.find("charset=");
+ if (start == string::npos)
+ break;
+ // Skip past "charset=".
+ start += 8;
+ if (start == content.size())
+ break;
+ size_t end = start;
+ if (content[start] != '"') {
+ // Unquoted value: stop at the first control, non-ASCII or
+ // separator character.
+ while (end < content.size()) {
+ unsigned char ch = content[end];
+ if (ch <= 32 || ch >= 127 || strchr(";()<>@,:\\\"/[]?={}", ch))
+ break;
+ ++end;
+ }
+ } else {
+ // Quoted value: read up to the closing quote; a backslash is
+ // removed and the character after it is kept as is.
+ ++start;
+ ++end;
+ while (end < content.size()) {
+ unsigned char ch = content[end];
+ if (ch == '"')
+ break;
+ if (ch == '\\')
+ content.erase(end, 1);
+ ++end;
+ }
+ }
+ string newcharset(content, start, end - start);
+ if (charset != newcharset) {
+ // Report the declared charset to the caller by exception.
+ throw newcharset;
+ }
+ }
+ }
+ break;
+ }
+ if (charset_from_meta)
+ break;
+ string newcharset;
+ if (get_parameter("charset", newcharset)) {
+ // HTML5 added: <meta charset="...">
+ lowercase_string(newcharset);
+ if (charset != newcharset) {
+ throw newcharset;
+ }
+ }
+ break;
+ }
+ if (tag == "marquee" || tag == "menu" || tag == "multicol")
+ pending_space = true;
+ break;
+ case 'o':
+ if (tag == "ol" || tag == "option")
+ pending_space = true;
+ break;
+ case 'p':
+ if (tag == "p" || tag == "pre" || tag == "plaintext")
+ pending_space = true;
+ break;
+ case 'q':
+ if (tag == "q")
+ pending_space = true;
+ break;
+ case 's':
+ if (tag == "style") {
+ in_style_tag = true;
+ break;
+ }
+ if (tag == "script") {
+ in_script_tag = true;
+ break;
+ }
+ if (tag == "select")
+ pending_space = true;
+ break;
+ case 't':
+ if (tag == "table" || tag == "td" || tag == "textarea" || tag == "th")
+ pending_space = true;
+ break;
+ case 'u':
+ if (tag == "ul")
+ pending_space = true;
+ break;
+ case 'x':
+ if (tag == "xmp")
+ pending_space = true;
+ break;
+ }
+}
+
+// Handle a closing tag.
+//
+// </body> stops parsing by throwing `true`. </style> and </script> switch
+// text collection back on. </title> moves the text collected so far into
+// `title` if no title has been stored yet. The remaining listed tags only
+// request a separating space.
+void zim::MyHtmlParser::closing_tag(const string &tag) {
+  if (tag.empty())
+    return;
+
+  // Tags with dedicated behaviour.
+  if (tag == "body") {
+    throw true;
+  }
+  if (tag == "style") {
+    in_style_tag = false;
+    return;
+  }
+  if (tag == "script") {
+    in_script_tag = false;
+    return;
+  }
+  if (tag == "title") {
+    if (title.empty())
+      swap(title, dump);
+    return;
+  }
+
+  // hr, and h1, ..., h6
+  if (tag[0] == 'h') {
+    if (tag.length() == 2 && strchr("r123456", tag[1]))
+      pending_space = true;
+    return;
+  }
+
+  // Every other tag whose closing form requests a separating space.
+  static const char *const spacing_tags[] = {
+    "address", "blockquote", "br", "center", "dd", "dir", "div", "dl", "dt",
+    "fieldset", "form", "iframe", "legend", "li", "listing", "marquee",
+    "menu", "ol", "option", "p", "pre", "q", "select", "table", "td",
+    "textarea", "th", "ul", "xmp"
+  };
+  for (const char *const candidate : spacing_tags) {
+    if (tag == candidate) {
+      pending_space = true;
+      return;
+    }
+  }
+}
--- /dev/null
+/* myhtmlparse.h: subclass of HtmlParser for extracting text
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
+#define OMEGA_INCLUDED_MYHTMLPARSE_H
+
+#include "htmlparse.h"
+
+// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
+// not in all charsets and perhaps spans of all \xa0 should become a single
+// \xa0?
+#define WHITESPACE " \t\n\r"
+
+namespace zim {
+
+// HtmlParser subclass that extracts plain text and a few metadata fields
+// (title, description sample, keywords, geo position) from a document.
+class MyHtmlParser : public HtmlParser {
+ public:
+ // True while inside <script> / <style>; text is not collected there.
+ bool in_script_tag;
+ bool in_style_tag;
+ // A space is owed before the next piece of text appended to `dump`.
+ bool pending_space;
+ // Cleared when a robots meta tag contains "none" or "noindex".
+ bool indexing_allowed;
+ // The charset in use was itself taken from a <meta> tag.
+ bool charset_from_meta;
+ // Only meaningful when has_geoPosition is true.
+ float latitude, longitude;
+ bool has_geoPosition;
+ // `dump` accumulates the extracted text; `sample` holds the meta description.
+ string title, sample, keywords, dump;
+ void process_text(const string &text);
+ void opening_tag(const string &tag);
+ void closing_tag(const string &tag);
+ // Keep the one-argument base overload visible next to the overload below.
+ using HtmlParser::parse_html;
+ void parse_html(const string &text, const string &charset_,
+ bool charset_from_meta_);
+ MyHtmlParser() :
+ in_script_tag(false),
+ in_style_tag(false),
+ pending_space(false),
+ indexing_allowed(true),
+ charset_from_meta(false),
+ latitude(0), longitude(0), has_geoPosition(false) { }
+
+ // Restore the members above to their freshly-constructed values so the
+ // parser can be used for another document.
+ void reset() {
+ in_script_tag = false;
+ in_style_tag = false;
+ pending_space = false;
+ indexing_allowed = true;
+ charset_from_meta = false;
+ latitude = longitude = 0;
+ has_geoPosition = false;
+ title.resize(0);
+ sample.resize(0);
+ keywords.resize(0);
+ dump.resize(0);
+ }
+};
+
+};
+
+#endif // OMEGA_INCLUDED_MYHTMLPARSE_H
--- /dev/null
+/* namedentities.h: named HTML entities.
+ *
+ * Copyright (C) 2006,2007 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
+#define OMEGA_INCLUDED_NAMEDENTITIES_H
+
+// Names and values from: "Character entity references in HTML 4"
+// http://www.w3.org/TR/html4/sgml/entities.html
+{ "quot", 34 },
+{ "amp", 38 },
+{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
+{ "lt", 60 },
+{ "gt", 62 },
+{ "nbsp", 160 },
+{ "iexcl", 161 },
+{ "cent", 162 },
+{ "pound", 163 },
+{ "curren", 164 },
+{ "yen", 165 },
+{ "brvbar", 166 },
+{ "sect", 167 },
+{ "uml", 168 },
+{ "copy", 169 },
+{ "ordf", 170 },
+{ "laquo", 171 },
+{ "not", 172 },
+{ "shy", 173 },
+{ "reg", 174 },
+{ "macr", 175 },
+{ "deg", 176 },
+{ "plusmn", 177 },
+{ "sup2", 178 },
+{ "sup3", 179 },
+{ "acute", 180 },
+{ "micro", 181 },
+{ "para", 182 },
+{ "middot", 183 },
+{ "cedil", 184 },
+{ "sup1", 185 },
+{ "ordm", 186 },
+{ "raquo", 187 },
+{ "frac14", 188 },
+{ "frac12", 189 },
+{ "frac34", 190 },
+{ "iquest", 191 },
+{ "Agrave", 192 },
+{ "Aacute", 193 },
+{ "Acirc", 194 },
+{ "Atilde", 195 },
+{ "Auml", 196 },
+{ "Aring", 197 },
+{ "AElig", 198 },
+{ "Ccedil", 199 },
+{ "Egrave", 200 },
+{ "Eacute", 201 },
+{ "Ecirc", 202 },
+{ "Euml", 203 },
+{ "Igrave", 204 },
+{ "Iacute", 205 },
+{ "Icirc", 206 },
+{ "Iuml", 207 },
+{ "ETH", 208 },
+{ "Ntilde", 209 },
+{ "Ograve", 210 },
+{ "Oacute", 211 },
+{ "Ocirc", 212 },
+{ "Otilde", 213 },
+{ "Ouml", 214 },
+{ "times", 215 },
+{ "Oslash", 216 },
+{ "Ugrave", 217 },
+{ "Uacute", 218 },
+{ "Ucirc", 219 },
+{ "Uuml", 220 },
+{ "Yacute", 221 },
+{ "THORN", 222 },
+{ "szlig", 223 },
+{ "agrave", 224 },
+{ "aacute", 225 },
+{ "acirc", 226 },
+{ "atilde", 227 },
+{ "auml", 228 },
+{ "aring", 229 },
+{ "aelig", 230 },
+{ "ccedil", 231 },
+{ "egrave", 232 },
+{ "eacute", 233 },
+{ "ecirc", 234 },
+{ "euml", 235 },
+{ "igrave", 236 },
+{ "iacute", 237 },
+{ "icirc", 238 },
+{ "iuml", 239 },
+{ "eth", 240 },
+{ "ntilde", 241 },
+{ "ograve", 242 },
+{ "oacute", 243 },
+{ "ocirc", 244 },
+{ "otilde", 245 },
+{ "ouml", 246 },
+{ "divide", 247 },
+{ "oslash", 248 },
+{ "ugrave", 249 },
+{ "uacute", 250 },
+{ "ucirc", 251 },
+{ "uuml", 252 },
+{ "yacute", 253 },
+{ "thorn", 254 },
+{ "yuml", 255 },
+{ "OElig", 338 },
+{ "oelig", 339 },
+{ "Scaron", 352 },
+{ "scaron", 353 },
+{ "Yuml", 376 },
+{ "fnof", 402 },
+{ "circ", 710 },
+{ "tilde", 732 },
+{ "Alpha", 913 },
+{ "Beta", 914 },
+{ "Gamma", 915 },
+{ "Delta", 916 },
+{ "Epsilon", 917 },
+{ "Zeta", 918 },
+{ "Eta", 919 },
+{ "Theta", 920 },
+{ "Iota", 921 },
+{ "Kappa", 922 },
+{ "Lambda", 923 },
+{ "Mu", 924 },
+{ "Nu", 925 },
+{ "Xi", 926 },
+{ "Omicron", 927 },
+{ "Pi", 928 },
+{ "Rho", 929 },
+{ "Sigma", 931 },
+{ "Tau", 932 },
+{ "Upsilon", 933 },
+{ "Phi", 934 },
+{ "Chi", 935 },
+{ "Psi", 936 },
+{ "Omega", 937 },
+{ "alpha", 945 },
+{ "beta", 946 },
+{ "gamma", 947 },
+{ "delta", 948 },
+{ "epsilon", 949 },
+{ "zeta", 950 },
+{ "eta", 951 },
+{ "theta", 952 },
+{ "iota", 953 },
+{ "kappa", 954 },
+{ "lambda", 955 },
+{ "mu", 956 },
+{ "nu", 957 },
+{ "xi", 958 },
+{ "omicron", 959 },
+{ "pi", 960 },
+{ "rho", 961 },
+{ "sigmaf", 962 },
+{ "sigma", 963 },
+{ "tau", 964 },
+{ "upsilon", 965 },
+{ "phi", 966 },
+{ "chi", 967 },
+{ "psi", 968 },
+{ "omega", 969 },
+{ "thetasym", 977 },
+{ "upsih", 978 },
+{ "piv", 982 },
+{ "ensp", 8194 },
+{ "emsp", 8195 },
+{ "thinsp", 8201 },
+{ "zwnj", 8204 },
+{ "zwj", 8205 },
+{ "lrm", 8206 },
+{ "rlm", 8207 },
+{ "ndash", 8211 },
+{ "mdash", 8212 },
+{ "lsquo", 8216 },
+{ "rsquo", 8217 },
+{ "sbquo", 8218 },
+{ "ldquo", 8220 },
+{ "rdquo", 8221 },
+{ "bdquo", 8222 },
+{ "dagger", 8224 },
+{ "Dagger", 8225 },
+{ "bull", 8226 },
+{ "hellip", 8230 },
+{ "permil", 8240 },
+{ "prime", 8242 },
+{ "Prime", 8243 },
+{ "lsaquo", 8249 },
+{ "rsaquo", 8250 },
+{ "oline", 8254 },
+{ "frasl", 8260 },
+{ "euro", 8364 },
+{ "image", 8465 },
+{ "weierp", 8472 },
+{ "real", 8476 },
+{ "trade", 8482 },
+{ "alefsym", 8501 },
+{ "larr", 8592 },
+{ "uarr", 8593 },
+{ "rarr", 8594 },
+{ "darr", 8595 },
+{ "harr", 8596 },
+{ "crarr", 8629 },
+{ "lArr", 8656 },
+{ "uArr", 8657 },
+{ "rArr", 8658 },
+{ "dArr", 8659 },
+{ "hArr", 8660 },
+{ "forall", 8704 },
+{ "part", 8706 },
+{ "exist", 8707 },
+{ "empty", 8709 },
+{ "nabla", 8711 },
+{ "isin", 8712 },
+{ "notin", 8713 },
+{ "ni", 8715 },
+{ "prod", 8719 },
+{ "sum", 8721 },
+{ "minus", 8722 },
+{ "lowast", 8727 },
+{ "radic", 8730 },
+{ "prop", 8733 },
+{ "infin", 8734 },
+{ "ang", 8736 },
+{ "and", 8743 },
+{ "or", 8744 },
+{ "cap", 8745 },
+{ "cup", 8746 },
+{ "int", 8747 },
+{ "there4", 8756 },
+{ "sim", 8764 },
+{ "cong", 8773 },
+{ "asymp", 8776 },
+{ "ne", 8800 },
+{ "equiv", 8801 },
+{ "le", 8804 },
+{ "ge", 8805 },
+{ "sub", 8834 },
+{ "sup", 8835 },
+{ "nsub", 8836 },
+{ "sube", 8838 },
+{ "supe", 8839 },
+{ "oplus", 8853 },
+{ "otimes", 8855 },
+{ "perp", 8869 },
+{ "sdot", 8901 },
+{ "lceil", 8968 },
+{ "rceil", 8969 },
+{ "lfloor", 8970 },
+{ "rfloor", 8971 },
+{ "lang", 9001 },
+{ "rang", 9002 },
+{ "loz", 9674 },
+{ "spades", 9824 },
+{ "clubs", 9827 },
+{ "hearts", 9829 },
+{ "diams", 9830 },
+
+#endif // OMEGA_INCLUDED_NAMEDENTITIES_H
--- /dev/null
+/*
+ * Copyright (C) 2018-2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+
+#ifndef ZIM_TYPES_H
+#define ZIM_TYPES_H
+
+#include <zim/zim.h>
+
+#include <ostream>
+
+#ifdef __GNUC__
+#define PACKED __attribute__((packed))
+#else
+#define PACKED
+#endif
+
+
+// Wrapper holding a single value `v` of base type B. Construction from B and
+// conversion back to B (or to bool) are explicit.
+template<typename B>
+struct REAL_TYPEDEF{
+  typedef B base_type;
+  B v;
+
+  // The default constructor stores zero.
+  REAL_TYPEDEF() : v(0) {}
+  explicit REAL_TYPEDEF(B v) : v(v) {}
+
+  // True when the wrapped value is non-zero.
+  explicit inline operator bool() const { return v != 0; }
+  explicit inline operator B() const { return v; }
+
+  inline bool operator==(const REAL_TYPEDEF<B>& rhs) const
+  { return rhs.v == v; }
+
+  // Pre-increment: bump the wrapped value and return this object.
+  inline REAL_TYPEDEF<B>& operator++()
+  { ++v; return *this; }
+
+  // Post-increment: return a wrapper holding the value before the bump.
+  inline REAL_TYPEDEF<B> operator++(int)
+  {
+    REAL_TYPEDEF<B> previous(v);
+    ++v;
+    return previous;
+  }
+
+} PACKED;
+
+// Generic arithmetic and comparison operators for types exposing a public
+// member `v` (and, for the mixed += overload, a nested `base_type`).
+// NOTE(review): these templates are unconstrained and live in the global
+// namespace - confirm they cannot be selected for unrelated types.
+
+// Adds the wrapped value of rhs to lhs.
+template<typename T> inline T& operator+= (T& lhs, const T& rhs)
+{
+ lhs.v += rhs.v;
+ return lhs;
+}
+
+// Adds a raw base-type value to lhs.
+template<typename T> inline T& operator+= (T& lhs, const typename T::base_type& rhs)
+{
+ lhs.v += rhs;
+ return lhs;
+}
+
+// lhs is taken by value so it can be updated and returned as the result.
+template<typename T> inline T operator+(T lhs, const T& rhs)
+{
+ lhs += rhs;
+ return lhs;
+}
+
+// Subtracts the wrapped value of rhs from lhs.
+template<typename T> inline T& operator-=(T& lhs, const T& rhs)
+{
+ lhs.v -= rhs.v;
+ return lhs;
+}
+
+// lhs is taken by value so it can be updated and returned as the result.
+template<typename T> inline T operator-(T lhs, const T& rhs)
+{
+ lhs -= rhs;
+ return lhs;
+}
+
+// operator< compares the wrapped values; >, <= and >= are all expressed in
+// terms of it.
+template<typename T> inline bool operator< (const T& lhs, const T& rhs)
+{ return lhs.v < rhs.v; }
+
+template<typename T> inline bool operator> (const T& lhs, const T& rhs)
+{ return rhs < lhs; }
+
+template<typename T> inline bool operator<=(const T& lhs, const T& rhs)
+{ return !(lhs > rhs); }
+
+template<typename T> inline bool operator>=(const T& lhs, const T& rhs)
+{ return !(lhs < rhs); }
+
+// Negation of whichever operator== applies to T.
+template<typename T> inline bool operator!=(const T& lhs, const T& rhs)
+{ return !(lhs == rhs); }
+
+
+// Streams a strong typedef by writing its wrapped value; returns the stream
+// so that insertions can be chained.
+template<typename B>
+std::ostream& operator<<(std::ostream& os, const REAL_TYPEDEF<B>& obj)
+{
+  return os << obj.v;
+}
+
+namespace zim {
+
+// TYPEDEF(NAME, TYPE) declares NAME as its own struct deriving from
+// REAL_TYPEDEF<TYPE>. Because every NAME is a separate struct, two names that
+// wrap the same TYPE (entry_index_t and title_index_t below) are still
+// different, non-interchangeable types. The constructor is explicit and
+// defaults the value to 0. The static_assert verifies that the wrapper is
+// exactly as large as the TYPE it wraps. The macro is only needed for the
+// declarations below and is #undef'ed right after them.
+#define TYPEDEF(NAME, TYPE) \
+  struct NAME : public REAL_TYPEDEF<TYPE> { \
+    explicit NAME(TYPE v = 0) : REAL_TYPEDEF<TYPE>(v) {} \
+  } PACKED; \
+  static_assert(sizeof(NAME) == sizeof(TYPE), "");
+
+// Index types.
+TYPEDEF(entry_index_t, entry_index_type)
+TYPEDEF(title_index_t, entry_index_type)
+TYPEDEF(cluster_index_t, cluster_index_type)
+TYPEDEF(blob_index_t, blob_index_type)
+
+// Size and offset types.
+TYPEDEF(zsize_t, size_type)
+TYPEDEF(offset_t, offset_type)
+
+#undef TYPEDEF
+
+// Mixed-type arithmetic: an offset can be advanced by a size, and the result
+// is again an offset. This is the only combination of two different wrapper
+// types declared in this header.
+inline offset_t& operator+=(offset_t& off, const zsize_t& size)
+{
+  off.v += size.v;
+  return off;
+}
+
+// Non-mutating form: `off` is a by-value copy, so the caller's offset is left
+// untouched.
+inline offset_t operator+(offset_t off, const zsize_t& size)
+{
+  return off += size;
+}
+
+} // namespace zim
+
+#endif //ZIM_TYPES_H
--- /dev/null
+
+# Manifest handed to the resource compiler as its input file.
+resources_list = 'resources_list.txt'
+
+# Runs the resource compiler on the manifest to generate a C++ source/header
+# pair. @OUTPUT0@ and @OUTPUT1@ refer to the entries of `output`, in order.
+# NOTE(review): only the manifest is declared as an input here; presumably the
+# files it names are read by the compiler too, in which case editing one of
+# them would not retrigger this target - confirm whether `depend_files` is
+# needed.
+lib_resources = custom_target(
+  'resources',
+  input: resources_list,
+  output: [
+    'libzim-resources.cpp',
+    'libzim-resources.h',
+  ],
+  command: [
+    res_compiler,
+    '--cxxfile', '@OUTPUT0@',
+    '--hfile', '@OUTPUT1@',
+    '--source_dir', '@OUTDIR@',
+    '@INPUT@',
+  ],
+)
--- /dev/null
+stopwords/af
+stopwords/ar
+stopwords/bg
+stopwords/bn
+stopwords/br
+stopwords/ca
+stopwords/cs
+stopwords/da
+stopwords/de
+stopwords/el
+stopwords/en
+stopwords/eo
+stopwords/es
+stopwords/et
+stopwords/eu
+stopwords/fa
+stopwords/fi
+stopwords/fr
+stopwords/ga
+stopwords/gl
+stopwords/gu
+stopwords/ha
+stopwords/he
+stopwords/hi
+stopwords/hr
+stopwords/hu
+stopwords/hy
+stopwords/id
+stopwords/it
+stopwords/ja
+stopwords/ko
+stopwords/ku
+stopwords/la
+stopwords/lt
+stopwords/lv
+stopwords/mr
+stopwords/ms
+stopwords/nl
+stopwords/no
+stopwords/pl
+stopwords/pt
+stopwords/ro
+stopwords/ru
+stopwords/sk
+stopwords/sl
+stopwords/so
+stopwords/st
+stopwords/sv
+stopwords/sw
+stopwords/th
+stopwords/tl
+stopwords/tr
+stopwords/uk
+stopwords/ur
+stopwords/vi
+stopwords/yo
+stopwords/zh
+stopwords/zu
\ No newline at end of file
--- /dev/null
+'n
+aan
+af
+al
+as
+baie
+by
+daar
+dag
+dat
+die
+dit
+een
+ek
+en
+gaan
+gesê
+haar
+het
+hom
+hulle
+hy
+in
+is
+jou
+jy
+kan
+kom
+ma
+maar
+met
+my
+na
+nie
+om
+ons
+op
+saam
+sal
+se
+sien
+so
+sy
+te
+toe
+uit
+van
+vir
+was
+wat
+ʼn
\ No newline at end of file
--- /dev/null
+،
+آض
+آمينَ
+آه
+آهاً
+آي
+أ
+أب
+أجل
+أجمع
+أخ
+أخذ
+أصبح
+أضحى
+أقبل
+أقل
+أكثر
+ألا
+أم
+أما
+أمامك
+أمامكَ
+أمسى
+أمّا
+أن
+أنا
+أنت
+أنتم
+أنتما
+أنتن
+أنتِ
+أنشأ
+أنّى
+أو
+أوشك
+أولئك
+أولئكم
+أولاء
+أولالك
+أوّهْ
+أي
+أيا
+أين
+أينما
+أيّ
+أَنَّ
+أََيُّ
+أُفٍّ
+إذ
+إذا
+إذاً
+إذما
+إذن
+إلى
+إليكم
+إليكما
+إليكنّ
+إليكَ
+إلَيْكَ
+إلّا
+إمّا
+إن
+إنّما
+إي
+إياك
+إياكم
+إياكما
+إياكن
+إيانا
+إياه
+إياها
+إياهم
+إياهما
+إياهن
+إياي
+إيهٍ
+إِنَّ
+ا
+ابتدأ
+اثر
+اجل
+احد
+اخرى
+اخلولق
+اذا
+اربعة
+ارتدّ
+استحال
+اطار
+اعادة
+اعلنت
+اف
+اكثر
+اكد
+الألاء
+الألى
+الا
+الاخيرة
+الان
+الاول
+الاولى
+التى
+التي
+الثاني
+الثانية
+الذاتي
+الذى
+الذي
+الذين
+السابق
+الف
+اللائي
+اللاتي
+اللتان
+اللتيا
+اللتين
+اللذان
+اللذين
+اللواتي
+الماضي
+المقبل
+الوقت
+الى
+اليوم
+اما
+امام
+امس
+ان
+انبرى
+انقلب
+انه
+انها
+او
+اول
+اي
+ايار
+ايام
+ايضا
+ب
+بات
+باسم
+بان
+بخٍ
+برس
+بسبب
+بسّ
+بشكل
+بضع
+بطآن
+بعد
+بعض
+بك
+بكم
+بكما
+بكن
+بل
+بلى
+بما
+بماذا
+بمن
+بن
+بنا
+به
+بها
+بي
+بيد
+بين
+بَسْ
+بَلْهَ
+بِئْسَ
+تانِ
+تانِك
+تبدّل
+تجاه
+تحوّل
+تلقاء
+تلك
+تلكم
+تلكما
+تم
+تينك
+تَيْنِ
+تِه
+تِي
+ثلاثة
+ثم
+ثمّ
+ثمّة
+ثُمَّ
+جعل
+جلل
+جميع
+جير
+حار
+حاشا
+حاليا
+حاي
+حتى
+حرى
+حسب
+حم
+حوالى
+حول
+حيث
+حيثما
+حين
+حيَّ
+حَبَّذَا
+حَتَّى
+حَذارِ
+خلا
+خلال
+دون
+دونك
+ذا
+ذات
+ذاك
+ذانك
+ذانِ
+ذلك
+ذلكم
+ذلكما
+ذلكن
+ذو
+ذوا
+ذواتا
+ذواتي
+ذيت
+ذينك
+ذَيْنِ
+ذِه
+ذِي
+راح
+رجع
+رويدك
+ريث
+رُبَّ
+زيارة
+سبحان
+سرعان
+سنة
+سنوات
+سوف
+سوى
+سَاءَ
+سَاءَمَا
+شبه
+شخصا
+شرع
+شَتَّانَ
+صار
+صباح
+صفر
+صهٍ
+صهْ
+ضد
+ضمن
+طاق
+طالما
+طفق
+طَق
+ظلّ
+عاد
+عام
+عاما
+عامة
+عدا
+عدة
+عدد
+عدم
+عسى
+عشر
+عشرة
+علق
+على
+عليك
+عليه
+عليها
+علًّ
+عن
+عند
+عندما
+عوض
+عين
+عَدَسْ
+عَمَّا
+غدا
+غير
+ـ
+ف
+فان
+فلان
+فو
+فى
+في
+فيم
+فيما
+فيه
+فيها
+قال
+قام
+قبل
+قد
+قطّ
+قلما
+قوة
+كأنّما
+كأين
+كأيّ
+كأيّن
+كاد
+كان
+كانت
+كذا
+كذلك
+كرب
+كل
+كلا
+كلاهما
+كلتا
+كلم
+كليكما
+كليهما
+كلّما
+كلَّا
+كم
+كما
+كي
+كيت
+كيف
+كيفما
+كَأَنَّ
+كِخ
+لئن
+لا
+لات
+لاسيما
+لدن
+لدى
+لعمر
+لقاء
+لك
+لكم
+لكما
+لكن
+لكنَّما
+لكي
+لكيلا
+للامم
+لم
+لما
+لمّا
+لن
+لنا
+له
+لها
+لو
+لوكالة
+لولا
+لوما
+لي
+لَسْتَ
+لَسْتُ
+لَسْتُم
+لَسْتُمَا
+لَسْتُنَّ
+لَسْتِ
+لَسْنَ
+لَعَلَّ
+لَكِنَّ
+لَيْتَ
+لَيْسَ
+لَيْسَا
+لَيْسَتَا
+لَيْسَتْ
+لَيْسُوا
+لَِسْنَا
+ما
+ماانفك
+مابرح
+مادام
+ماذا
+مازال
+مافتئ
+مايو
+متى
+مثل
+مذ
+مساء
+مع
+معاذ
+مقابل
+مكانكم
+مكانكما
+مكانكنّ
+مكانَك
+مليار
+مليون
+مما
+ممن
+من
+منذ
+منها
+مه
+مهما
+مَنْ
+مِن
+نحن
+نحو
+نعم
+نفس
+نفسه
+نهاية
+نَخْ
+نِعِمّا
+نِعْمَ
+ها
+هاؤم
+هاكَ
+هاهنا
+هبّ
+هذا
+هذه
+هكذا
+هل
+هلمَّ
+هلّا
+هم
+هما
+هن
+هنا
+هناك
+هنالك
+هو
+هي
+هيا
+هيت
+هيّا
+هَؤلاء
+هَاتانِ
+هَاتَيْنِ
+هَاتِه
+هَاتِي
+هَجْ
+هَذا
+هَذانِ
+هَذَيْنِ
+هَذِه
+هَذِي
+هَيْهَاتَ
+و
+و6
+وا
+واحد
+واضاف
+واضافت
+واكد
+وان
+واهاً
+واوضح
+وراءَك
+وفي
+وقال
+وقالت
+وقد
+وقف
+وكان
+وكانت
+ولا
+ولم
+ومن
+وهو
+وهي
+ويكأنّ
+وَيْ
+وُشْكَانََ
+يكون
+يمكن
+يوم
+ّأيّان
\ No newline at end of file
--- /dev/null
+а
+автентичен
+аз
+ако
+ала
+бе
+без
+беше
+би
+бивш
+бивша
+бившо
+бил
+била
+били
+било
+благодаря
+близо
+бъдат
+бъде
+бяха
+в
+вас
+ваш
+ваша
+вероятно
+вече
+взема
+ви
+вие
+винаги
+внимава
+време
+все
+всеки
+всички
+всичко
+всяка
+във
+въпреки
+върху
+г
+ги
+главен
+главна
+главно
+глас
+го
+година
+години
+годишен
+д
+да
+дали
+два
+двама
+двамата
+две
+двете
+ден
+днес
+дни
+до
+добра
+добре
+добро
+добър
+докато
+докога
+дори
+досега
+доста
+друг
+друга
+други
+е
+евтин
+едва
+един
+една
+еднаква
+еднакви
+еднакъв
+едно
+екип
+ето
+живот
+за
+забавям
+зад
+заедно
+заради
+засега
+заспал
+затова
+защо
+защото
+и
+из
+или
+им
+има
+имат
+иска
+й
+каза
+как
+каква
+какво
+както
+какъв
+като
+кога
+когато
+което
+които
+кой
+който
+колко
+която
+къде
+където
+към
+лесен
+лесно
+ли
+лош
+м
+май
+малко
+ме
+между
+мек
+мен
+месец
+ми
+много
+мнозина
+мога
+могат
+може
+мокър
+моля
+момента
+му
+н
+на
+над
+назад
+най
+направи
+напред
+например
+нас
+не
+него
+нещо
+нея
+ни
+ние
+никой
+нито
+нищо
+но
+нов
+нова
+нови
+новина
+някои
+някой
+няколко
+няма
+обаче
+около
+освен
+особено
+от
+отгоре
+отново
+още
+пак
+по
+повече
+повечето
+под
+поне
+поради
+после
+почти
+прави
+пред
+преди
+през
+при
+пък
+първата
+първи
+първо
+пъти
+равен
+равна
+с
+са
+сам
+само
+се
+сега
+си
+син
+скоро
+след
+следващ
+сме
+смях
+според
+сред
+срещу
+сте
+съм
+със
+също
+т
+т.н.
+тази
+така
+такива
+такъв
+там
+твой
+те
+тези
+ти
+то
+това
+тогава
+този
+той
+толкова
+точно
+три
+трябва
+тук
+тъй
+тя
+тях
+у
+утре
+харесва
+хиляди
+ч
+часа
+че
+често
+чрез
+ще
+щом
+юмрук
+я
+як
\ No newline at end of file
--- /dev/null
+অতএব
+অথচ
+অথবা
+অনুযায়ী
+অনেক
+অনেকে
+অনেকেই
+অন্তত
+অন্য
+অবধি
+অবশ্য
+অর্থাত
+আই
+আগামী
+আগে
+আগেই
+আছে
+আজ
+আদ্যভাগে
+আপনার
+আপনি
+আবার
+আমরা
+আমাকে
+আমাদের
+আমার
+আমি
+আর
+আরও
+ই
+ইত্যাদি
+ইহা
+উচিত
+উত্তর
+উনি
+উপর
+উপরে
+এ
+এঁদের
+এঁরা
+এই
+একই
+একটি
+একবার
+একে
+এক্
+এখন
+এখনও
+এখানে
+এখানেই
+এটা
+এটাই
+এটি
+এত
+এতটাই
+এতে
+এদের
+এব
+এবং
+এবার
+এমন
+এমনকী
+এমনি
+এর
+এরা
+এল
+এস
+এসে
+ঐ
+ও
+ওঁদের
+ওঁর
+ওঁরা
+ওই
+ওকে
+ওখানে
+ওদের
+ওর
+ওরা
+কখনও
+কত
+কবে
+কমনে
+কয়েক
+কয়েকটি
+করছে
+করছেন
+করতে
+করবে
+করবেন
+করলে
+করলেন
+করা
+করাই
+করায়
+করার
+করি
+করিতে
+করিয়া
+করিয়ে
+করে
+করেই
+করেছিলেন
+করেছে
+করেছেন
+করেন
+কাউকে
+কাছ
+কাছে
+কাজ
+কাজে
+কারও
+কারণ
+কি
+কিংবা
+কিছু
+কিছুই
+কিন্তু
+কী
+কে
+কেউ
+কেউই
+কেখা
+কেন
+কোটি
+কোন
+কোনও
+কোনো
+ক্ষেত্রে
+কয়েক
+খুব
+গিয়ে
+গিয়েছে
+গিয়ে
+গুলি
+গেছে
+গেল
+গেলে
+গোটা
+চলে
+চান
+চায়
+চার
+চালু
+চেয়ে
+চেষ্টা
+ছাড়া
+ছাড়াও
+ছিল
+ছিলেন
+জন
+জনকে
+জনের
+জন্য
+জন্যওজে
+জানতে
+জানা
+জানানো
+জানায়
+জানিয়ে
+জানিয়েছে
+জে
+জ্নজন
+টি
+ঠিক
+তখন
+তত
+তথা
+তবু
+তবে
+তা
+তাঁকে
+তাঁদের
+তাঁর
+তাঁরা
+তাঁাহারা
+তাই
+তাও
+তাকে
+তাতে
+তাদের
+তার
+তারপর
+তারা
+তারৈ
+তাহলে
+তাহা
+তাহাতে
+তাহার
+তিনঐ
+তিনি
+তিনিও
+তুমি
+তুলে
+তেমন
+তো
+তোমার
+থাকবে
+থাকবেন
+থাকা
+থাকায়
+থাকে
+থাকেন
+থেকে
+থেকেই
+থেকেও
+দিকে
+দিতে
+দিন
+দিয়ে
+দিয়েছে
+দিয়েছেন
+দিলেন
+দু
+দুই
+দুটি
+দুটো
+দেওয়া
+দেওয়ার
+দেওয়া
+দেখতে
+দেখা
+দেখে
+দেন
+দেয়
+দ্বারা
+ধরা
+ধরে
+ধামার
+নতুন
+নয়
+না
+নাই
+নাকি
+নাগাদ
+নানা
+নিজে
+নিজেই
+নিজেদের
+নিজের
+নিতে
+নিয়ে
+নিয়ে
+নেই
+নেওয়া
+নেওয়ার
+নেওয়া
+নয়
+পক্ষে
+পর
+পরে
+পরেই
+পরেও
+পর্যন্ত
+পাওয়া
+পাচ
+পারি
+পারে
+পারেন
+পি
+পেয়ে
+পেয়্র্
+প্রতি
+প্রথম
+প্রভৃতি
+প্রযন্ত
+প্রাথমিক
+প্রায়
+প্রায়
+ফলে
+ফিরে
+ফের
+বক্তব্য
+বদলে
+বন
+বরং
+বলতে
+বলল
+বললেন
+বলা
+বলে
+বলেছেন
+বলেন
+বসে
+বহু
+বা
+বাদে
+বার
+বি
+বিনা
+বিভিন্ন
+বিশেষ
+বিষয়টি
+বেশ
+বেশি
+ব্যবহার
+ব্যাপারে
+ভাবে
+ভাবেই
+মতো
+মতোই
+মধ্যভাগে
+মধ্যে
+মধ্যেই
+মধ্যেও
+মনে
+মাত্র
+মাধ্যমে
+মোট
+মোটেই
+যখন
+যত
+যতটা
+যথেষ্ট
+যদি
+যদিও
+যা
+যাঁর
+যাঁরা
+যাওয়া
+যাওয়ার
+যাওয়া
+যাকে
+যাচ্ছে
+যাতে
+যাদের
+যান
+যাবে
+যায়
+যার
+যারা
+যিনি
+যে
+যেখানে
+যেতে
+যেন
+যেমন
+র
+রকম
+রয়েছে
+রাখা
+রেখে
+লক্ষ
+শুধু
+শুরু
+সঙ্গে
+সঙ্গেও
+সব
+সবার
+সমস্ত
+সম্প্রতি
+সহ
+সহিত
+সাধারণ
+সামনে
+সি
+সুতরাং
+সে
+সেই
+সেখান
+সেখানে
+সেটা
+সেটাই
+সেটাও
+সেটি
+স্পষ্ট
+স্বয়ং
+হইতে
+হইবে
+হইয়া
+হওয়া
+হওয়ায়
+হওয়ার
+হচ্ছে
+হত
+হতে
+হতেই
+হন
+হবে
+হবেন
+হয়
+হয়তো
+হয়নি
+হয়ে
+হয়েই
+হয়েছিল
+হয়েছে
+হয়েছেন
+হল
+হলে
+হলেই
+হলেও
+হলো
+হাজার
+হিসাবে
+হৈলে
+হোক
+হয়
\ No newline at end of file
--- /dev/null
+'blam
+'d
+'m
+'r
+'ta
+'vat
+'z
+'zo
+a
+a:
+aba
+abalamour
+abaoe
+ac'hane
+ac'hanoc'h
+ac'hanomp
+ac'hanon
+ac'hanout
+adal
+adalek
+adarre
+ae
+aec'h
+aed
+aemp
+aen
+aent
+aes
+afe
+afec'h
+afed
+afemp
+afen
+afent
+afes
+ag
+ah
+aimp
+aint
+aio
+aiou
+aje
+ajec'h
+ajed
+ajemp
+ajen
+ajent
+ajes
+al
+alato
+alies
+aliesañ
+alkent
+all
+allas
+allo
+allô
+am
+amañ
+amzer
+an
+anezhañ
+anezhe
+anezhi
+anezho
+anvet
+aon
+aotren
+ar
+arall
+araok
+araoki
+araozañ
+araozo
+araozoc'h
+araozomp
+araozon
+araozor
+araozout
+arbenn
+arre
+atalek
+atav
+az
+azalek
+azirazañ
+azirazi
+azirazo
+azirazoc'h
+azirazomp
+azirazon
+azirazor
+azirazout
+b:
+ba
+ba'l
+ba'n
+ba'r
+bad
+bah
+bal
+ban
+bar
+bastañ
+befe
+bell
+benaos
+benn
+bennag
+bennak
+bennozh
+bep
+bepred
+berr
+berzh
+bet
+betek
+betra
+bev
+bevet
+bez
+bezañ
+beze
+bezent
+bezet
+bezh
+bezit
+bezomp
+bihan
+bije
+biou
+biskoazh
+blam
+bo
+boa
+bominapl
+boudoudom
+bouez
+boull
+boum
+bout
+bras
+brasañ
+brav
+bravo
+bremañ
+bres
+brokenn
+bronn
+brrr
+brutal
+buhezek
+c'h:
+c'haout
+c'he
+c'hem
+c'herz
+c'heñver
+c'hichen
+c'hiz
+c'hoazh
+c'horre
+c'houde
+c'houst
+c'hreiz
+c'hwec'h
+c'hwec'hvet
+c'hwezek
+c'hwi
+ch:
+chaous
+chik
+chit
+chom
+chut
+d'
+d'al
+d'an
+d'ar
+d'az
+d'e
+d'he
+d'ho
+d'hol
+d'hon
+d'hor
+d'o
+d'ober
+d'ul
+d'un
+d'ur
+d:
+da
+dak
+daka
+dal
+dalbezh
+dalc'hmat
+dalit
+damdost
+damheñvel
+damm
+dan
+danvez
+dao
+daol
+daonet
+daou
+daoust
+daouzek
+daouzekvet
+darn
+dastrewiñ
+dav
+davedoc'h
+davedomp
+davedon
+davedor
+davedout
+davet
+davetañ
+davete
+daveti
+daveto
+defe
+dehou
+dek
+dekvet
+den
+deoc'h
+deomp
+deor
+derc'hel
+deus
+dez
+deze
+dezhañ
+dezhe
+dezhi
+dezho
+di
+diabarzh
+diagent
+diar
+diaraok
+diavaez
+dibaoe
+dibaot
+dibar
+dic'halañ
+didiac'h
+dienn
+difer
+diganeoc'h
+diganeomp
+diganeor
+diganimp
+diganin
+diganit
+digant
+digantañ
+digante
+diganti
+diganto
+digemmesk
+diget
+digor
+digoret
+dija
+dije
+dimp
+din
+dinaou
+dindan
+dindanañ
+dindani
+dindano
+dindanoc'h
+dindanomp
+dindanon
+dindanor
+dindanout
+dioutañ
+dioute
+diouti
+diouto
+diouzh
+diouzhin
+diouzhit
+diouzhoc'h
+diouzhomp
+diouzhor
+dirak
+dirazañ
+dirazi
+dirazo
+dirazoc'h
+dirazomp
+dirazon
+dirazor
+dirazout
+disheñvel
+dispar
+distank
+dister
+disterañ
+disterig
+distro
+dit
+divaez
+diwar
+diwezhat
+diwezhañ
+do
+doa
+doare
+dont
+dost
+doue
+douetus
+douez
+doug
+draou
+draoñ
+dre
+drede
+dreist
+dreistañ
+dreisti
+dreisto
+dreistoc'h
+dreistomp
+dreiston
+dreistor
+dreistout
+drek
+dreñv
+dring
+dro
+du
+e
+e:
+eas
+ebet
+ec'h
+edo
+edoc'h
+edod
+edomp
+edon
+edont
+edos
+eer
+eeun
+efed
+egedoc'h
+egedomp
+egedon
+egedor
+egedout
+eget
+egetañ
+egete
+egeti
+egeto
+eh
+eil
+eilvet
+eizh
+eizhvet
+ejoc'h
+ejod
+ejomp
+ejont
+ejout
+el
+em
+emaint
+emaoc'h
+emaomp
+emaon
+emaout
+emañ
+eme
+emeur
+emezañ
+emezi
+emezo
+emezoc'h
+emezomp
+emezon
+emezout
+emporzhiañ
+en
+end
+endan
+endra
+enep
+ennañ
+enni
+enno
+ennoc'h
+ennomp
+ennon
+ennor
+ennout
+enta
+eo
+eomp
+eont
+eor
+eot
+er
+erbet
+erfin
+esa
+esae
+espar
+estlamm
+estrañj
+eta
+etre
+etreoc'h
+etrezo
+etrezoc'h
+etrezomp
+etrezor
+euh
+eur
+eus
+evel
+evelato
+eveldoc'h
+eveldomp
+eveldon
+eveldor
+eveldout
+evelkent
+eveltañ
+evelte
+evelti
+evelto
+evidoc'h
+evidomp
+evidon
+evidor
+evidout
+evit
+evitañ
+evite
+eviti
+evito
+ez
+eñ
+f:
+fac'h
+fall
+fed
+feiz
+fenn
+fezh
+fin
+finsalvet
+foei
+fouilhezañ
+g:
+gallout
+ganeoc'h
+ganeomp
+ganin
+ganit
+gant
+gantañ
+ganti
+ganto
+gaout
+gast
+gein
+gellout
+genndost
+gentañ
+ger
+gerz
+get
+geñver
+gichen
+gin
+giz
+glan
+gloev
+goll
+gorre
+goude
+gouez
+gouezit
+gouezomp
+goulz
+gounnar
+gour
+goust
+gouze
+gouzout
+gra
+grak
+grec'h
+greiz
+grenn
+greomp
+grit
+groñs
+gutez
+gwall
+gwashoc'h
+gwazh
+gwech
+gwechall
+gwechoù
+gwell
+gwezh
+gwezhall
+gwezharall
+gwezhoù
+gwig
+gwirionez
+gwitibunan
+gêr
+h:
+ha
+hag
+han
+hanter
+hanterc'hantad
+hanterkantved
+harz
+hañ
+hañval
+he
+hebioù
+hec'h
+hei
+hein
+hem
+hemañ
+hen
+hend
+henhont
+henn
+hennezh
+hent
+hep
+hervez
+hervezañ
+hervezi
+hervezo
+hervezoc'h
+hervezomp
+hervezon
+hervezor
+hervezout
+heul
+heuliañ
+hevelep
+heverk
+heñvel
+heñvelat
+heñvelañ
+heñveliñ
+heñveloc'h
+heñvelout
+hi
+hilh
+hini
+hirie
+hirio
+hiziv
+hiziviken
+ho
+hoaliñ
+hoc'h
+hogen
+hogos
+hogozik
+hol
+holl
+holà
+homañ
+hon
+honhont
+honnezh
+hont
+hop
+hopala
+hor
+hou
+houp
+hudu
+hue
+hui
+hum
+hurrah
+i
+i:
+in
+int
+is
+ispisial
+isurzhiet
+it
+ivez
+izelañ
+j:
+just
+k:
+kae
+kaer
+kalon
+kalz
+kant
+kaout
+kar
+kazi
+keid
+kein
+keit
+kel
+kellies
+keloù
+kement
+ken
+kenkent
+kenkoulz
+kenment
+kent
+kentañ
+kentizh
+kentoc'h
+kentre
+ker
+kerkent
+kerz
+kerzh
+ket
+keta
+keñver
+keñverel
+keñverius
+kichen
+kichenik
+kit
+kiz
+klak
+klek
+klik
+komprenet
+komz
+kont
+korf
+korre
+koulskoude
+koulz
+koust
+krak
+krampouezh
+krec'h
+kreiz
+kuit
+kwir
+l:
+la
+laez
+laoskel
+laouen
+lavar
+lavaret
+lavarout
+lec'h
+lein
+leizh
+lerc'h
+leun
+leuskel
+lew
+lies
+liesañ
+lod
+lusk
+lâr
+lârout
+m:
+ma
+ma'z
+mac'h
+mac'hat
+mac'hañ
+mac'hoc'h
+mad
+maez
+maksimal
+mann
+mar
+mard
+marg
+marzh
+mat
+mañ
+me
+memes
+memestra
+merkapl
+mersi
+mes
+mesk
+met
+meur
+mil
+minimal
+moan
+moaniaat
+mod
+mont
+mout
+mui
+muiañ
+muioc'h
+n
+n'
+n:
+na
+nag
+naontek
+naturel
+nav
+navet
+ne
+nebeudig
+nebeut
+nebeutañ
+nebeutoc'h
+neketa
+nemedoc'h
+nemedomp
+nemedon
+nemedor
+nemedout
+nemet
+nemetañ
+nemete
+nemeti
+nemeto
+nemeur
+neoac'h
+nepell
+nerzh
+nes
+neseser
+netra
+neubeudoù
+neuhe
+neuze
+nevez
+newazh
+nez
+ni
+nikun
+niverus
+nul
+o
+o:
+oa
+oac'h
+oad
+oamp
+oan
+oant
+oar
+oas
+ober
+oc'h
+oc'ho
+oc'hola
+oc'hpenn
+oh
+ohe
+ollé
+olole
+olé
+omp
+on
+ordin
+ordinal
+ouejoc'h
+ouejod
+ouejomp
+ouejont
+ouejout
+ouek
+ouezas
+ouezi
+ouezimp
+ouezin
+ouezint
+ouezis
+ouezo
+ouezoc'h
+ouezor
+ouf
+oufe
+oufec'h
+oufed
+oufemp
+oufen
+oufent
+oufes
+ouie
+ouiec'h
+ouied
+ouiemp
+ouien
+ouient
+ouies
+ouije
+ouijec'h
+ouijed
+ouijemp
+ouijen
+ouijent
+ouijes
+out
+outañ
+outi
+outo
+ouzer
+ouzh
+ouzhin
+ouzhit
+ouzhoc'h
+ouzhomp
+ouzhor
+ouzhpenn
+ouzhpennik
+ouzoc'h
+ouzomp
+ouzon
+ouzont
+ouzout
+p'
+p:
+pa
+pad
+padal
+paf
+pan
+panevedeoc'h
+panevedo
+panevedomp
+panevedon
+panevedout
+panevet
+panevetañ
+paneveti
+pas
+paseet
+pe
+peadra
+peder
+pedervet
+pedervetvet
+pefe
+pegeit
+pegement
+pegen
+pegiz
+pegoulz
+pehini
+pelec'h
+pell
+pemod
+pemp
+pempved
+pemzek
+penaos
+penn
+peogwir
+peotramant
+pep
+perak
+perc'hennañ
+pergen
+permetiñ
+peseurt
+pet
+petiaoul
+petoare
+petra
+peur
+peurgetket
+peurheñvel
+peurliesañ
+peurvuiañ
+peus
+peustost
+peuz
+pevar
+pevare
+pevarevet
+pevarzek
+pez
+peze
+pezh
+pff
+pfft
+pfut
+picher
+pif
+pife
+pign
+pije
+pikol
+pitiaoul
+piv
+plaouf
+plok
+plouf
+po
+poa
+poelladus
+pof
+pok
+posupl
+pouah
+pourc'henn
+prest
+prestik
+prim
+prin
+provostapl
+pst
+pu
+pur
+r:
+ra
+rae
+raec'h
+raed
+raemp
+raen
+raent
+raes
+rafe
+rafec'h
+rafed
+rafemp
+rafen
+rafent
+rafes
+rag
+raimp
+raint
+raio
+raje
+rajec'h
+rajed
+rajemp
+rajen
+rajent
+rajes
+rak
+ral
+ran
+rankout
+raok
+razh
+re
+reas
+reer
+regennoù
+reiñ
+rejoc'h
+rejod
+rejomp
+rejont
+rejout
+rener
+rentañ
+reoc'h
+reomp
+reont
+reor
+reot
+resis
+ret
+reve
+rez
+ri
+rik
+rin
+ris
+rit
+rouez
+s:
+sac'h
+sant
+sav
+sañset
+se
+sed
+seitek
+seizh
+seizhvet
+sell
+sellit
+ser
+setu
+seul
+seurt
+siwazh
+skignañ
+skoaz
+skouer
+sort
+souden
+souvitañ
+soñj
+speriañ
+spririñ
+stad
+stlabezañ
+stop
+stranañ
+strewiñ
+strishaat
+stumm
+sujed
+surtoud
+t:
+ta
+taer
+tailh
+tak
+tal
+talvoudegezh
+tamm
+tanav
+taol
+te
+techet
+teir
+teirvet
+telt
+teltenn
+teus
+teut
+teuteu
+ti
+tik
+toa
+tok
+tost
+tostig
+toud
+touesk
+touez
+toull
+tra
+trantenn
+traoñ
+trawalc'h
+tre
+trede
+tregont
+tremenet
+tri
+trivet
+triwec'h
+trizek
+tro
+trugarez
+trumm
+tsoin
+tsouin
+tu
+tud
+u:
+ugent
+uhel
+uhelañ
+ul
+un
+unan
+unanez
+unanig
+unnek
+unnekvet
+ur
+urzh
+us
+v:
+va
+vale
+van
+vare
+vat
+vefe
+vefec'h
+vefed
+vefemp
+vefen
+vefent
+vefes
+vesk
+vete
+vez
+vezan
+vezañ
+veze
+vezec'h
+vezed
+vezemp
+vezen
+vezent
+vezer
+vezes
+vezez
+vezit
+vezomp
+vezont
+vi
+vihan
+vihanañ
+vije
+vijec'h
+vijed
+vijemp
+vijen
+vijent
+vijes
+viken
+vimp
+vin
+vint
+vior
+viot
+virviken
+viskoazh
+vlan
+vlaou
+vo
+vod
+voe
+voec'h
+voed
+voemp
+voen
+voent
+voes
+vont
+vostapl
+vrac'h
+vrasañ
+vremañ
+w:
+walc'h
+war
+warnañ
+warni
+warno
+warnoc'h
+warnomp
+warnon
+warnor
+warnout
+wazh
+wech
+wechoù
+well
+y:
+you
+youadenn
+youc'hadenn
+youc'hou
+z:
+za
+zan
+zaw
+zeu
+zi
+ziar
+zigarez
+ziget
+zindan
+zioc'h
+ziouzh
+zirak
+zivout
+ziwar
+ziwezhañ
+zo
+zoken
+zokenoc'h
+zouesk
+zouez
+zro
+zu
\ No newline at end of file
--- /dev/null
+a
+abans
+ací
+ah
+així
+això
+al
+aleshores
+algun
+alguna
+algunes
+alguns
+alhora
+allà
+allí
+allò
+als
+altra
+altre
+altres
+amb
+ambdues
+ambdós
+anar
+ans
+apa
+aquell
+aquella
+aquelles
+aquells
+aquest
+aquesta
+aquestes
+aquests
+aquí
+baix
+bastant
+bé
+cada
+cadascuna
+cadascunes
+cadascuns
+cadascú
+com
+consegueixo
+conseguim
+conseguir
+consigueix
+consigueixen
+consigueixes
+contra
+d'un
+d'una
+d'unes
+d'uns
+dalt
+de
+del
+dels
+des
+des de
+després
+dins
+dintre
+donat
+doncs
+durant
+e
+eh
+el
+elles
+ells
+els
+em
+en
+encara
+ens
+entre
+era
+erem
+eren
+eres
+es
+esta
+estan
+estat
+estava
+estaven
+estem
+esteu
+estic
+està
+estàvem
+estàveu
+et
+etc
+ets
+fa
+faig
+fan
+fas
+fem
+fer
+feu
+fi
+fins
+fora
+gairebé
+ha
+han
+has
+haver
+havia
+he
+hem
+heu
+hi
+ho
+i
+igual
+iguals
+inclòs
+ja
+jo
+l'hi
+la
+les
+li
+li'n
+llarg
+llavors
+m'he
+ma
+mal
+malgrat
+mateix
+mateixa
+mateixes
+mateixos
+me
+mentre
+meu
+meus
+meva
+meves
+mode
+molt
+molta
+moltes
+molts
+mon
+mons
+més
+n'he
+n'hi
+ne
+ni
+no
+nogensmenys
+només
+nosaltres
+nostra
+nostre
+nostres
+o
+oh
+oi
+on
+pas
+pel
+pels
+per
+per que
+perquè
+però
+poc
+poca
+pocs
+podem
+poden
+poder
+podeu
+poques
+potser
+primer
+propi
+puc
+qual
+quals
+quan
+quant
+que
+quelcom
+qui
+quin
+quina
+quines
+quins
+què
+s'ha
+s'han
+sa
+sabem
+saben
+saber
+sabeu
+sap
+saps
+semblant
+semblants
+sense
+ser
+ses
+seu
+seus
+seva
+seves
+si
+sobre
+sobretot
+soc
+solament
+sols
+som
+son
+sons
+sota
+sou
+sóc
+són
+t'ha
+t'han
+t'he
+ta
+tal
+també
+tampoc
+tan
+tant
+tanta
+tantes
+te
+tene
+tenim
+tenir
+teniu
+teu
+teus
+teva
+teves
+tinc
+ton
+tons
+tot
+tota
+totes
+tots
+un
+una
+unes
+uns
+us
+va
+vaig
+vam
+van
+vas
+veu
+vosaltres
+vostra
+vostre
+vostres
+érem
+éreu
+és
+éssent
+últim
+ús
\ No newline at end of file
--- /dev/null
+a
+aby
+ahoj
+aj
+ale
+anebo
+ani
+aniž
+ano
+asi
+aspoň
+atd
+atp
+az
+ačkoli
+až
+bez
+beze
+blízko
+bohužel
+brzo
+bude
+budem
+budeme
+budes
+budete
+budeš
+budou
+budu
+by
+byl
+byla
+byli
+bylo
+byly
+bys
+byt
+být
+během
+chce
+chceme
+chcete
+chceš
+chci
+chtít
+chtějí
+chut'
+chuti
+ci
+clanek
+clanku
+clanky
+co
+coz
+což
+cz
+daleko
+dalsi
+další
+den
+deset
+design
+devatenáct
+devět
+dnes
+do
+dobrý
+docela
+dva
+dvacet
+dvanáct
+dvě
+dál
+dále
+děkovat
+děkujeme
+děkuji
+email
+ho
+hodně
+i
+jak
+jakmile
+jako
+jakož
+jde
+je
+jeden
+jedenáct
+jedna
+jedno
+jednou
+jedou
+jeho
+jehož
+jej
+jeji
+jejich
+její
+jelikož
+jemu
+jen
+jenom
+jenž
+jeste
+jestli
+jestliže
+ještě
+jež
+ji
+jich
+jimi
+jinak
+jine
+jiné
+jiz
+již
+jsem
+jses
+jseš
+jsi
+jsme
+jsou
+jste
+já
+jí
+jím
+jíž
+jšte
+k
+kam
+každý
+kde
+kdo
+kdy
+kdyz
+když
+ke
+kolik
+kromě
+ktera
+ktere
+kteri
+kterou
+ktery
+která
+které
+který
+kteři
+kteří
+ku
+kvůli
+ma
+mají
+mate
+me
+mezi
+mi
+mit
+mne
+mnou
+mně
+moc
+mohl
+mohou
+moje
+moji
+možná
+muj
+musí
+muze
+my
+má
+málo
+mám
+máme
+máte
+máš
+mé
+mí
+mít
+mě
+můj
+může
+na
+nad
+nade
+nam
+napiste
+napište
+naproti
+nas
+nasi
+načež
+naše
+naši
+ne
+nebo
+nebyl
+nebyla
+nebyli
+nebyly
+nechť
+nedělají
+nedělá
+nedělám
+neděláme
+neděláte
+neděláš
+neg
+nejsi
+nejsou
+nemají
+nemáme
+nemáte
+neměl
+neni
+není
+nestačí
+nevadí
+nez
+než
+nic
+nich
+nimi
+nove
+novy
+nové
+nový
+nula
+ná
+nám
+námi
+nás
+náš
+ní
+ním
+ně
+něco
+nějak
+někde
+někdo
+němu
+němuž
+o
+od
+ode
+on
+ona
+oni
+ono
+ony
+osm
+osmnáct
+pak
+patnáct
+po
+pod
+podle
+pokud
+potom
+pouze
+pozdě
+pořád
+prave
+pravé
+pred
+pres
+pri
+pro
+proc
+prostě
+prosím
+proti
+proto
+protoze
+protože
+proč
+prvni
+první
+práve
+pta
+pět
+před
+přede
+přes
+přese
+při
+přičemž
+re
+rovně
+s
+se
+sedm
+sedmnáct
+si
+sice
+skoro
+smí
+smějí
+snad
+spolu
+sta
+sto
+strana
+sté
+sve
+svych
+svym
+svymi
+své
+svých
+svým
+svými
+svůj
+ta
+tady
+tak
+take
+takhle
+taky
+takze
+také
+takže
+tam
+tamhle
+tamhleto
+tamto
+tato
+te
+tebe
+tebou
+ted'
+tedy
+tema
+ten
+tento
+teto
+ti
+tim
+timto
+tipy
+tisíc
+tisíce
+to
+tobě
+tohle
+toho
+tohoto
+tom
+tomto
+tomu
+tomuto
+toto
+trošku
+tu
+tuto
+tvoje
+tvá
+tvé
+tvůj
+ty
+tyto
+téma
+této
+tím
+tímto
+tě
+těm
+těma
+těmu
+třeba
+tři
+třináct
+u
+určitě
+uz
+už
+v
+vam
+vas
+vase
+vaše
+vaši
+ve
+vedle
+večer
+vice
+vlastně
+vsak
+vy
+vám
+vámi
+vás
+váš
+více
+však
+všechen
+všechno
+všichni
+vůbec
+vždy
+z
+za
+zatímco
+zač
+zda
+zde
+ze
+zpet
+zpravy
+zprávy
+zpět
+čau
+či
+článek
+článku
+články
+čtrnáct
+čtyři
+šest
+šestnáct
+že
\ No newline at end of file
--- /dev/null
+ad
+af
+aldrig
+alle
+alt
+anden
+andet
+andre
+at
+bare
+begge
+blev
+blive
+bliver
+da
+de
+dem
+den
+denne
+der
+deres
+det
+dette
+dig
+din
+dine
+disse
+dit
+dog
+du
+efter
+ej
+eller
+en
+end
+ene
+eneste
+enhver
+er
+et
+far
+fem
+fik
+fire
+flere
+fleste
+for
+fordi
+forrige
+fra
+få
+får
+før
+god
+godt
+ham
+han
+hans
+har
+havde
+have
+hej
+helt
+hende
+hendes
+her
+hos
+hun
+hvad
+hvem
+hver
+hvilken
+hvis
+hvor
+hvordan
+hvorfor
+hvornår
+i
+ikke
+ind
+ingen
+intet
+ja
+jeg
+jer
+jeres
+jo
+kan
+kom
+komme
+kommer
+kun
+kunne
+lad
+lav
+lidt
+lige
+lille
+man
+mand
+mange
+med
+meget
+men
+mens
+mere
+mig
+min
+mine
+mit
+mod
+må
+ned
+nej
+ni
+nogen
+noget
+nogle
+nu
+ny
+nyt
+når
+nær
+næste
+næsten
+og
+også
+okay
+om
+op
+os
+otte
+over
+på
+se
+seks
+selv
+ser
+ses
+sig
+sige
+sin
+sine
+sit
+skal
+skulle
+som
+stor
+store
+syv
+så
+sådan
+tag
+tage
+thi
+ti
+til
+to
+tre
+ud
+under
+var
+ved
+vi
+vil
+ville
+vor
+vores
+være
+været
\ No newline at end of file
--- /dev/null
+a
+ab
+aber
+ach
+acht
+achte
+achten
+achter
+achtes
+ag
+alle
+allein
+allem
+allen
+aller
+allerdings
+alles
+allgemeinen
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+au
+auch
+auf
+aus
+ausser
+ausserdem
+außer
+außerdem
+b
+bald
+bei
+beide
+beiden
+beim
+beispiel
+bekannt
+bereits
+besonders
+besser
+besten
+bin
+bis
+bisher
+bist
+c
+d
+d.h
+da
+dabei
+dadurch
+dafür
+dagegen
+daher
+dahin
+dahinter
+damals
+damit
+danach
+daneben
+dank
+dann
+daran
+darauf
+daraus
+darf
+darfst
+darin
+darum
+darunter
+darüber
+das
+dasein
+daselbst
+dass
+dasselbe
+davon
+davor
+dazu
+dazwischen
+daß
+dein
+deine
+deinem
+deinen
+deiner
+deines
+dem
+dementsprechend
+demgegenüber
+demgemäss
+demgemäß
+demselben
+demzufolge
+den
+denen
+denn
+denselben
+der
+deren
+derer
+derjenige
+derjenigen
+dermassen
+dermaßen
+derselbe
+derselben
+des
+deshalb
+desselben
+dessen
+deswegen
+dich
+die
+diejenige
+diejenigen
+dies
+diese
+dieselbe
+dieselben
+diesem
+diesen
+dieser
+dieses
+dir
+doch
+dort
+drei
+drin
+dritte
+dritten
+dritter
+drittes
+du
+durch
+durchaus
+durfte
+durften
+dürfen
+dürft
+e
+eben
+ebenso
+ehrlich
+ei
+ei,
+eigen
+eigene
+eigenen
+eigener
+eigenes
+ein
+einander
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+eins
+elf
+en
+ende
+endlich
+entweder
+er
+ernst
+erst
+erste
+ersten
+erster
+erstes
+es
+etwa
+etwas
+euch
+euer
+eure
+eurem
+euren
+eurer
+eures
+f
+folgende
+früher
+fünf
+fünfte
+fünften
+fünfter
+fünftes
+für
+g
+gab
+ganz
+ganze
+ganzen
+ganzer
+ganzes
+gar
+gedurft
+gegen
+gegenüber
+gehabt
+gehen
+geht
+gekannt
+gekonnt
+gemacht
+gemocht
+gemusst
+genug
+gerade
+gern
+gesagt
+geschweige
+gewesen
+gewollt
+geworden
+gibt
+ging
+gleich
+gott
+gross
+grosse
+grossen
+grosser
+grosses
+groß
+große
+großen
+großer
+großes
+gut
+gute
+guter
+gutes
+h
+hab
+habe
+haben
+habt
+hast
+hat
+hatte
+hatten
+hattest
+hattet
+heisst
+her
+heute
+hier
+hin
+hinter
+hoch
+hätte
+hätten
+i
+ich
+ihm
+ihn
+ihnen
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+im
+immer
+in
+indem
+infolgedessen
+ins
+irgend
+ist
+j
+ja
+jahr
+jahre
+jahren
+je
+jede
+jedem
+jeden
+jeder
+jedermann
+jedermanns
+jedes
+jedoch
+jemand
+jemandem
+jemanden
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+k
+kam
+kann
+kannst
+kaum
+kein
+keine
+keinem
+keinen
+keiner
+keines
+kleine
+kleinen
+kleiner
+kleines
+kommen
+kommt
+konnte
+konnten
+kurz
+können
+könnt
+könnte
+l
+lang
+lange
+leicht
+leide
+lieber
+los
+m
+machen
+macht
+machte
+mag
+magst
+mahn
+mal
+man
+manche
+manchem
+manchen
+mancher
+manches
+mann
+mehr
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mensch
+menschen
+mich
+mir
+mit
+mittel
+mochte
+mochten
+morgen
+muss
+musst
+musste
+mussten
+muß
+mußt
+möchte
+mögen
+möglich
+mögt
+müssen
+müsst
+müßt
+n
+na
+nach
+nachdem
+nahm
+natürlich
+neben
+nein
+neue
+neuen
+neun
+neunte
+neunten
+neunter
+neuntes
+nicht
+nichts
+nie
+niemand
+niemandem
+niemanden
+noch
+nun
+nur
+o
+ob
+oben
+oder
+offen
+oft
+ohne
+ordnung
+p
+q
+r
+recht
+rechte
+rechten
+rechter
+rechtes
+richtig
+rund
+s
+sa
+sache
+sagt
+sagte
+sah
+satt
+schlecht
+schluss
+schon
+sechs
+sechste
+sechsten
+sechster
+sechstes
+sehr
+sei
+seid
+seien
+sein
+seine
+seinem
+seinen
+seiner
+seines
+seit
+seitdem
+selbst
+sich
+sie
+sieben
+siebente
+siebenten
+siebenter
+siebentes
+sind
+so
+solang
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollen
+sollst
+sollt
+sollte
+sollten
+sondern
+sonst
+soweit
+sowie
+später
+startseite
+statt
+steht
+suche
+t
+tag
+tage
+tagen
+tat
+teil
+tel
+tritt
+trotzdem
+tun
+u
+uhr
+um
+und
+uns
+unse
+unsem
+unsen
+unser
+unsere
+unserer
+unses
+unter
+v
+vergangenen
+viel
+viele
+vielem
+vielen
+vielleicht
+vier
+vierte
+vierten
+vierter
+viertes
+vom
+von
+vor
+w
+wahr
+wann
+war
+waren
+warst
+wart
+warum
+was
+weg
+wegen
+weil
+weit
+weiter
+weitere
+weiteren
+weiteres
+welche
+welchem
+welchen
+welcher
+welches
+wem
+wen
+wenig
+wenige
+weniger
+weniges
+wenigstens
+wenn
+wer
+werde
+werden
+werdet
+weshalb
+wessen
+wie
+wieder
+wieso
+will
+willst
+wir
+wird
+wirklich
+wirst
+wissen
+wo
+woher
+wohin
+wohl
+wollen
+wollt
+wollte
+wollten
+worden
+wurde
+wurden
+während
+währenddem
+währenddessen
+wäre
+würde
+würden
+x
+y
+z
+z.b
+zehn
+zehnte
+zehnten
+zehnter
+zehntes
+zeit
+zu
+zuerst
+zugleich
+zum
+zunächst
+zur
+zurück
+zusammen
+zwanzig
+zwar
+zwei
+zweite
+zweiten
+zweiter
+zweites
+zwischen
+zwölf
+über
+überhaupt
+übrigens
\ No newline at end of file
--- /dev/null
+ένα
+έναν
+ένας
+αι
+ακομα
+ακομη
+ακριβως
+αληθεια
+αληθινα
+αλλα
+αλλαχου
+αλλες
+αλλη
+αλλην
+αλλης
+αλλιως
+αλλιωτικα
+αλλο
+αλλοι
+αλλοιως
+αλλοιωτικα
+αλλον
+αλλος
+αλλοτε
+αλλου
+αλλους
+αλλων
+αμα
+αμεσα
+αμεσως
+αν
+ανα
+αναμεσα
+αναμεταξυ
+ανευ
+αντι
+αντιπερα
+αντις
+ανω
+ανωτερω
+αξαφνα
+απ
+απεναντι
+απο
+αποψε
+από
+αρα
+αραγε
+αργα
+αργοτερο
+αριστερα
+αρκετα
+αρχικα
+ας
+αυριο
+αυτα
+αυτες
+αυτεσ
+αυτη
+αυτην
+αυτης
+αυτο
+αυτοι
+αυτον
+αυτος
+αυτοσ
+αυτου
+αυτους
+αυτουσ
+αυτων
+αφοτου
+αφου
+αἱ
+αἳ
+αἵ
+αὐτόσ
+αὐτὸς
+αὖ
+α∆ιακοπα
+βεβαια
+βεβαιοτατα
+γάρ
+γα
+γα^
+γε
+γι
+για
+γοῦν
+γρηγορα
+γυρω
+γὰρ
+δ'
+δέ
+δή
+δαί
+δαίσ
+δαὶ
+δαὶς
+δε
+δεν
+δι
+δι'
+διά
+δια
+διὰ
+δὲ
+δὴ
+δ’
+εαν
+εαυτο
+εαυτον
+εαυτου
+εαυτους
+εαυτων
+εγκαιρα
+εγκαιρως
+εγω
+ειθε
+ειμαι
+ειμαστε
+ειναι
+εις
+εισαι
+εισαστε
+ειστε
+ειτε
+ειχα
+ειχαμε
+ειχαν
+ειχατε
+ειχε
+ειχες
+ει∆εμη
+εκ
+εκαστα
+εκαστες
+εκαστη
+εκαστην
+εκαστης
+εκαστο
+εκαστοι
+εκαστον
+εκαστος
+εκαστου
+εκαστους
+εκαστων
+εκει
+εκεινα
+εκεινες
+εκεινεσ
+εκεινη
+εκεινην
+εκεινης
+εκεινο
+εκεινοι
+εκεινον
+εκεινος
+εκεινοσ
+εκεινου
+εκεινους
+εκεινουσ
+εκεινων
+εκτος
+εμας
+εμεις
+εμενα
+εμπρος
+εν
+ενα
+εναν
+ενας
+ενος
+εντελως
+εντος
+εντωμεταξυ
+ενω
+ενός
+εξ
+εξαφνα
+εξης
+εξισου
+εξω
+επ
+επί
+επανω
+επειτα
+επει∆η
+επι
+επισης
+επομενως
+εσας
+εσεις
+εσενα
+εστω
+εσυ
+ετερα
+ετεραι
+ετερας
+ετερες
+ετερη
+ετερης
+ετερο
+ετεροι
+ετερον
+ετερος
+ετερου
+ετερους
+ετερων
+ετουτα
+ετουτες
+ετουτη
+ετουτην
+ετουτης
+ετουτο
+ετουτοι
+ετουτον
+ετουτος
+ετουτου
+ετουτους
+ετουτων
+ετσι
+ευγε
+ευθυς
+ευτυχως
+εφεξης
+εχει
+εχεις
+εχετε
+εχθες
+εχομε
+εχουμε
+εχουν
+εχτες
+εχω
+εως
+εἰ
+εἰμί
+εἰμὶ
+εἰς
+εἰσ
+εἴ
+εἴμι
+εἴτε
+ε∆ω
+η
+ημασταν
+ημαστε
+ημουν
+ησασταν
+ησαστε
+ησουν
+ηταν
+ητανε
+ητοι
+ηττον
+η∆η
+θα
+ι
+ιι
+ιιι
+ισαμε
+ισια
+ισως
+ισωσ
+ι∆ια
+ι∆ιαν
+ι∆ιας
+ι∆ιες
+ι∆ιο
+ι∆ιοι
+ι∆ιον
+ι∆ιος
+ι∆ιου
+ι∆ιους
+ι∆ιων
+ι∆ιως
+κ
+καί
+καίτοι
+καθ
+καθε
+καθεμια
+καθεμιας
+καθενα
+καθενας
+καθενος
+καθετι
+καθολου
+καθως
+και
+κακα
+κακως
+καλα
+καλως
+καμια
+καμιαν
+καμιας
+καμποσα
+καμποσες
+καμποση
+καμποσην
+καμποσης
+καμποσο
+καμποσοι
+καμποσον
+καμποσος
+καμποσου
+καμποσους
+καμποσων
+κανεις
+κανεν
+κανενα
+κανεναν
+κανενας
+κανενος
+καποια
+καποιαν
+καποιας
+καποιες
+καποιο
+καποιοι
+καποιον
+καποιος
+καποιου
+καποιους
+καποιων
+καποτε
+καπου
+καπως
+κατ
+κατά
+κατα
+κατι
+κατιτι
+κατοπιν
+κατω
+κατὰ
+καὶ
+κι
+κιολας
+κλπ
+κοντα
+κτλ
+κυριως
+κἀν
+κἂν
+λιγακι
+λιγο
+λιγωτερο
+λογω
+λοιπα
+λοιπον
+μέν
+μέσα
+μή
+μήτε
+μία
+μα
+μαζι
+μακαρι
+μακρυα
+μαλιστα
+μαλλον
+μας
+με
+μεθ
+μεθαυριο
+μειον
+μελει
+μελλεται
+μεμιας
+μεν
+μερικα
+μερικες
+μερικοι
+μερικους
+μερικων
+μεσα
+μετ
+μετά
+μετα
+μεταξυ
+μετὰ
+μεχρι
+μη
+μην
+μηπως
+μητε
+μη∆ε
+μιά
+μια
+μιαν
+μιας
+μολις
+μολονοτι
+μοναχα
+μονες
+μονη
+μονην
+μονης
+μονο
+μονοι
+μονομιας
+μονος
+μονου
+μονους
+μονων
+μου
+μπορει
+μπορουν
+μπραβο
+μπρος
+μἐν
+μὲν
+μὴ
+μὴν
+να
+ναι
+νωρις
+ξανα
+ξαφνικα
+ο
+οι
+ολα
+ολες
+ολη
+ολην
+ολης
+ολο
+ολογυρα
+ολοι
+ολον
+ολονεν
+ολος
+ολοτελα
+ολου
+ολους
+ολων
+ολως
+ολως∆ιολου
+ομως
+ομωσ
+οποια
+οποιαν
+οποιαν∆ηποτε
+οποιας
+οποιας∆ηποτε
+οποια∆ηποτε
+οποιες
+οποιες∆ηποτε
+οποιο
+οποιοι
+οποιον
+οποιον∆ηποτε
+οποιος
+οποιος∆ηποτε
+οποιου
+οποιους
+οποιους∆ηποτε
+οποιου∆ηποτε
+οποιο∆ηποτε
+οποιων
+οποιων∆ηποτε
+οποι∆ηποτε
+οποτε
+οποτε∆ηποτε
+οπου
+οπου∆ηποτε
+οπως
+οπωσ
+ορισμενα
+ορισμενες
+ορισμενων
+ορισμενως
+οσα
+οσα∆ηποτε
+οσες
+οσες∆ηποτε
+οση
+οσην
+οσην∆ηποτε
+οσης
+οσης∆ηποτε
+οση∆ηποτε
+οσο
+οσοι
+οσοι∆ηποτε
+οσον
+οσον∆ηποτε
+οσος
+οσος∆ηποτε
+οσου
+οσους
+οσους∆ηποτε
+οσου∆ηποτε
+οσο∆ηποτε
+οσων
+οσων∆ηποτε
+οταν
+οτι
+οτι∆ηποτε
+οτου
+ου
+ουτε
+ου∆ε
+οχι
+οἱ
+οἳ
+οἷς
+οὐ
+οὐδ
+οὐδέ
+οὐδείσ
+οὐδεὶς
+οὐδὲ
+οὐδὲν
+οὐκ
+οὐχ
+οὐχὶ
+οὓς
+οὔτε
+οὕτω
+οὕτως
+οὕτωσ
+οὖν
+οὗ
+οὗτος
+οὗτοσ
+παλι
+παντοτε
+παντου
+παντως
+παρ
+παρά
+παρα
+παρὰ
+περί
+περα
+περι
+περιπου
+περισσοτερο
+περσι
+περυσι
+περὶ
+πια
+πιθανον
+πιο
+πισω
+πλαι
+πλεον
+πλην
+ποια
+ποιαν
+ποιας
+ποιες
+ποιεσ
+ποιο
+ποιοι
+ποιον
+ποιος
+ποιοσ
+ποιου
+ποιους
+ποιουσ
+ποιων
+πολυ
+ποσες
+ποση
+ποσην
+ποσης
+ποσοι
+ποσος
+ποσους
+ποτε
+που
+πουθε
+πουθενα
+ποῦ
+πρεπει
+πριν
+προ
+προκειμενου
+προκειται
+προπερσι
+προς
+προσ
+προτου
+προχθες
+προχτες
+πρωτυτερα
+πρόσ
+πρὸ
+πρὸς
+πως
+πωσ
+σαν
+σας
+σε
+σεις
+σημερα
+σιγα
+σου
+στα
+στη
+στην
+στης
+στις
+στο
+στον
+στου
+στους
+στων
+συγχρονως
+συν
+συναμα
+συνεπως
+συνηθως
+συχνα
+συχνας
+συχνες
+συχνη
+συχνην
+συχνης
+συχνο
+συχνοι
+συχνον
+συχνος
+συχνου
+συχνους
+συχνων
+συχνως
+σχε∆ον
+σωστα
+σόσ
+σύ
+σύν
+σὸς
+σὺ
+σὺν
+τά
+τήν
+τί
+τίς
+τίσ
+τα
+ταυτα
+ταυτες
+ταυτη
+ταυτην
+ταυτης
+ταυτο,ταυτον
+ταυτος
+ταυτου
+ταυτων
+ταχα
+ταχατε
+ταῖς
+τα∆ε
+τε
+τελικα
+τελικως
+τες
+τετοια
+τετοιαν
+τετοιας
+τετοιες
+τετοιο
+τετοιοι
+τετοιον
+τετοιος
+τετοιου
+τετοιους
+τετοιων
+τη
+την
+της
+τησ
+τι
+τινα
+τιποτα
+τιποτε
+τις
+τισ
+το
+τοί
+τοι
+τοιοῦτος
+τοιοῦτοσ
+τον
+τος
+τοσα
+τοσες
+τοση
+τοσην
+τοσης
+τοσο
+τοσοι
+τοσον
+τοσος
+τοσου
+τοσους
+τοσων
+τοτε
+του
+τουλαχιστο
+τουλαχιστον
+τους
+τουτα
+τουτες
+τουτη
+τουτην
+τουτης
+τουτο
+τουτοι
+τουτοις
+τουτον
+τουτος
+τουτου
+τουτους
+τουτων
+τούσ
+τοὺς
+τοῖς
+τοῦ
+τυχον
+των
+τωρα
+τό
+τόν
+τότε
+τὰ
+τὰς
+τὴν
+τὸ
+τὸν
+τῆς
+τῆσ
+τῇ
+τῶν
+τῷ
+υπ
+υπερ
+υπο
+υποψη
+υποψιν
+υπό
+υστερα
+φετος
+χαμηλα
+χθες
+χτες
+χωρις
+χωριστα
+ψηλα
+ω
+ωραια
+ως
+ωσ
+ωσαν
+ωσοτου
+ωσπου
+ωστε
+ωστοσο
+ωχ
+ἀλλ'
+ἀλλά
+ἀλλὰ
+ἀλλ’
+ἀπ
+ἀπό
+ἀπὸ
+ἀφ
+ἂν
+ἃ
+ἄλλος
+ἄλλοσ
+ἄν
+ἄρα
+ἅμα
+ἐάν
+ἐγώ
+ἐγὼ
+ἐκ
+ἐμόσ
+ἐμὸς
+ἐν
+ἐξ
+ἐπί
+ἐπεὶ
+ἐπὶ
+ἐστι
+ἐφ
+ἐὰν
+ἑαυτοῦ
+ἔτι
+ἡ
+ἢ
+ἣ
+ἤ
+ἥ
+ἧς
+ἵνα
+ὁ
+ὃ
+ὃν
+ὃς
+ὅ
+ὅδε
+ὅθεν
+ὅπερ
+ὅς
+ὅσ
+ὅστις
+ὅστισ
+ὅτε
+ὅτι
+ὑμόσ
+ὑπ
+ὑπέρ
+ὑπό
+ὑπὲρ
+ὑπὸ
+ὡς
+ὡσ
+ὥς
+ὥστε
+ὦ
+ᾧ
+∆α
+∆ε
+∆εινα
+∆εν
+∆εξια
+∆ηθεν
+∆ηλα∆η
+∆ι
+∆ια
+∆ιαρκως
+∆ικα
+∆ικο
+∆ικοι
+∆ικος
+∆ικου
+∆ικους
+∆ιολου
+∆ιπλα
+∆ιχως
\ No newline at end of file
--- /dev/null
+'ll
+'tis
+'twas
+'ve
+10
+39
+a
+a's
+able
+ableabout
+about
+above
+abroad
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+ad
+added
+adj
+adopted
+ae
+af
+affected
+affecting
+affects
+after
+afterwards
+ag
+again
+against
+ago
+ah
+ahead
+ai
+ain't
+aint
+al
+all
+allow
+allows
+almost
+alone
+along
+alongside
+already
+also
+although
+always
+am
+amid
+amidst
+among
+amongst
+amoungst
+amount
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+ao
+apart
+apparently
+appear
+appreciate
+appropriate
+approximately
+aq
+ar
+are
+area
+areas
+aren
+aren't
+arent
+arise
+around
+arpa
+as
+aside
+ask
+asked
+asking
+asks
+associated
+at
+au
+auth
+available
+aw
+away
+awfully
+az
+b
+ba
+back
+backed
+backing
+backs
+backward
+backwards
+bb
+bd
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+began
+begin
+beginning
+beginnings
+begins
+behind
+being
+beings
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+bf
+bg
+bh
+bi
+big
+bill
+billion
+biol
+bj
+bm
+bn
+bo
+both
+bottom
+br
+brief
+briefly
+bs
+bt
+but
+buy
+bv
+bw
+by
+bz
+c
+c'mon
+c's
+ca
+call
+came
+can
+can't
+cannot
+cant
+caption
+case
+cases
+cause
+causes
+cc
+cd
+certain
+certainly
+cf
+cg
+ch
+changes
+ci
+ck
+cl
+clear
+clearly
+click
+cm
+cmon
+cn
+co
+co.
+com
+come
+comes
+computer
+con
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+copy
+corresponding
+could
+could've
+couldn
+couldn't
+couldnt
+course
+cr
+cry
+cs
+cu
+currently
+cv
+cx
+cy
+cz
+d
+dare
+daren't
+darent
+date
+de
+dear
+definitely
+describe
+described
+despite
+detail
+did
+didn
+didn't
+didnt
+differ
+different
+differently
+directly
+dj
+dk
+dm
+do
+does
+doesn
+doesn't
+doesnt
+doing
+don
+don't
+done
+dont
+doubtful
+down
+downed
+downing
+downs
+downwards
+due
+during
+dz
+e
+each
+early
+ec
+ed
+edu
+ee
+effect
+eg
+eh
+eight
+eighty
+either
+eleven
+else
+elsewhere
+empty
+end
+ended
+ending
+ends
+enough
+entirely
+er
+es
+especially
+et
+et-al
+etc
+even
+evenly
+ever
+evermore
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+f
+face
+faces
+fact
+facts
+fairly
+far
+farther
+felt
+few
+fewer
+ff
+fi
+fifteen
+fifth
+fifty
+fify
+fill
+find
+finds
+fire
+first
+five
+fix
+fj
+fk
+fm
+fo
+followed
+following
+follows
+for
+forever
+former
+formerly
+forth
+forty
+forward
+found
+four
+fr
+free
+from
+front
+full
+fully
+further
+furthered
+furthering
+furthermore
+furthers
+fx
+g
+ga
+gave
+gb
+gd
+ge
+general
+generally
+get
+gets
+getting
+gf
+gg
+gh
+gi
+give
+given
+gives
+giving
+gl
+gm
+gmt
+gn
+go
+goes
+going
+gone
+good
+goods
+got
+gotten
+gov
+gp
+gq
+gr
+great
+greater
+greatest
+greetings
+group
+grouped
+grouping
+groups
+gs
+gt
+gu
+gw
+gy
+h
+had
+hadn't
+hadnt
+half
+happens
+hardly
+has
+hasn
+hasn't
+hasnt
+have
+haven
+haven't
+havent
+having
+he
+he'd
+he'll
+he's
+hed
+hell
+hello
+help
+hence
+her
+here
+here's
+hereafter
+hereby
+herein
+heres
+hereupon
+hers
+herself
+herse”
+hes
+hi
+hid
+high
+higher
+highest
+him
+himself
+himse”
+his
+hither
+hk
+hm
+hn
+home
+homepage
+hopefully
+how
+how'd
+how'll
+how's
+howbeit
+however
+hr
+ht
+htm
+html
+http
+hu
+hundred
+i
+i'd
+i'll
+i'm
+i've
+i.e.
+id
+ie
+if
+ignored
+ii
+il
+ill
+im
+immediate
+immediately
+importance
+important
+in
+inasmuch
+inc
+inc.
+indeed
+index
+indicate
+indicated
+indicates
+information
+inner
+inside
+insofar
+instead
+int
+interest
+interested
+interesting
+interests
+into
+invention
+inward
+io
+iq
+ir
+is
+isn
+isn't
+isnt
+it
+it'd
+it'll
+it's
+itd
+itll
+its
+itself
+itse”
+ive
+j
+je
+jm
+jo
+join
+jp
+just
+k
+ke
+keep
+keeps
+kept
+keys
+kg
+kh
+ki
+kind
+km
+kn
+knew
+know
+known
+knows
+kp
+kr
+kw
+ky
+kz
+l
+la
+large
+largely
+last
+lately
+later
+latest
+latter
+latterly
+lb
+lc
+least
+length
+less
+lest
+let
+let's
+lets
+li
+like
+liked
+likely
+likewise
+line
+little
+lk
+ll
+long
+longer
+longest
+look
+looking
+looks
+low
+lower
+lr
+ls
+lt
+ltd
+lu
+lv
+ly
+m
+ma
+made
+mainly
+make
+makes
+making
+man
+many
+may
+maybe
+mayn't
+maynt
+mc
+md
+me
+mean
+means
+meantime
+meanwhile
+member
+members
+men
+merely
+mg
+mh
+microsoft
+might
+might've
+mightn't
+mightnt
+mil
+mill
+million
+mine
+minus
+miss
+mk
+ml
+mm
+mn
+mo
+more
+moreover
+most
+mostly
+move
+mp
+mq
+mr
+mrs
+ms
+msie
+mt
+mu
+much
+mug
+must
+must've
+mustn't
+mustnt
+mv
+mw
+mx
+my
+myself
+myse”
+mz
+n
+na
+name
+namely
+nay
+nc
+nd
+ne
+near
+nearly
+necessarily
+necessary
+need
+needed
+needing
+needn't
+neednt
+needs
+neither
+net
+netscape
+never
+neverf
+neverless
+nevertheless
+new
+newer
+newest
+next
+nf
+ng
+ni
+nine
+ninety
+nl
+no
+no-one
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+notwithstanding
+novel
+now
+nowhere
+np
+nr
+nu
+null
+number
+numbers
+nz
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+older
+oldest
+om
+omitted
+on
+once
+one
+one's
+ones
+only
+onto
+open
+opened
+opening
+opens
+opposite
+or
+ord
+order
+ordered
+ordering
+orders
+org
+other
+others
+otherwise
+ought
+oughtn't
+oughtnt
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+pa
+page
+pages
+part
+parted
+particular
+particularly
+parting
+parts
+past
+pe
+per
+perhaps
+pf
+pg
+ph
+pk
+pl
+place
+placed
+places
+please
+plus
+pm
+pmid
+pn
+point
+pointed
+pointing
+points
+poorly
+possible
+possibly
+potentially
+pp
+pr
+predominantly
+present
+presented
+presenting
+presents
+presumably
+previously
+primarily
+probably
+problem
+problems
+promptly
+proud
+provided
+provides
+pt
+put
+puts
+pw
+py
+q
+qa
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+reasonably
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+reserved
+respectively
+resulted
+resulting
+results
+right
+ring
+ro
+room
+rooms
+round
+ru
+run
+rw
+s
+sa
+said
+same
+saw
+say
+saying
+says
+sb
+sc
+sd
+se
+sec
+second
+secondly
+seconds
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+sees
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+seventy
+several
+sg
+sh
+shall
+shan't
+shant
+she
+she'd
+she'll
+she's
+shed
+shell
+shes
+should
+should've
+shouldn
+shouldn't
+shouldnt
+show
+showed
+showing
+shown
+showns
+shows
+si
+side
+sides
+significant
+significantly
+similar
+similarly
+since
+sincere
+site
+six
+sixty
+sj
+sk
+sl
+slightly
+sm
+small
+smaller
+smallest
+sn
+so
+some
+somebody
+someday
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+sr
+st
+state
+states
+still
+stop
+strongly
+su
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+sv
+sy
+system
+sz
+t
+t's
+take
+taken
+taking
+tc
+td
+tell
+ten
+tends
+test
+text
+tf
+tg
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+that's
+that've
+thatll
+thats
+thatve
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+there'd
+there'll
+there're
+there's
+there've
+thereafter
+thereby
+thered
+therefore
+therein
+therell
+thereof
+therere
+theres
+thereto
+thereupon
+thereve
+these
+they
+they'd
+they'll
+they're
+they've
+theyd
+theyll
+theyre
+theyve
+thick
+thin
+thing
+things
+think
+thinks
+third
+thirty
+this
+thorough
+thoroughly
+those
+thou
+though
+thoughh
+thought
+thoughts
+thousand
+three
+throug
+through
+throughout
+thru
+thus
+til
+till
+tip
+tis
+tj
+tk
+tm
+tn
+to
+today
+together
+too
+took
+top
+toward
+towards
+tp
+tr
+tried
+tries
+trillion
+truly
+try
+trying
+ts
+tt
+turn
+turned
+turning
+turns
+tv
+tw
+twas
+twelve
+twenty
+twice
+two
+tz
+u
+ua
+ug
+uk
+um
+un
+under
+underneath
+undoing
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+upwards
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+uucp
+uy
+uz
+v
+va
+value
+various
+vc
+ve
+versus
+very
+vg
+vi
+via
+viz
+vn
+vol
+vols
+vs
+vu
+w
+want
+wanted
+wanting
+wants
+was
+wasn
+wasn't
+wasnt
+way
+ways
+we
+we'd
+we'll
+we're
+we've
+web
+webpage
+website
+wed
+welcome
+well
+wells
+went
+were
+weren
+weren't
+werent
+weve
+wf
+what
+what'd
+what'll
+what's
+what've
+whatever
+whatll
+whats
+whatve
+when
+when'd
+when'll
+when's
+whence
+whenever
+where
+where'd
+where'll
+where's
+whereafter
+whereas
+whereby
+wherein
+wheres
+whereupon
+wherever
+whether
+which
+whichever
+while
+whilst
+whim
+whither
+who
+who'd
+who'll
+who's
+whod
+whoever
+whole
+wholl
+whom
+whomever
+whos
+whose
+why
+why'd
+why'll
+why's
+widely
+width
+will
+willing
+wish
+with
+within
+without
+won
+won't
+wonder
+wont
+words
+work
+worked
+working
+works
+world
+would
+would've
+wouldn
+wouldn't
+wouldnt
+ws
+www
+x
+y
+ye
+year
+years
+yes
+yet
+you
+you'd
+you'll
+you're
+you've
+youd
+youll
+young
+younger
+youngest
+your
+youre
+yours
+yourself
+yourselves
+youve
+yt
+yu
+z
+za
+zero
+zm
+zr
\ No newline at end of file
--- /dev/null
+adiaŭ
+ajn
+al
+ankoraŭ
+antaŭ
+aŭ
+bonan
+bonvole
+bonvolu
+bv
+ci
+cia
+cian
+cin
+d-ro
+da
+de
+dek
+deka
+do
+doktor'
+doktoro
+du
+dua
+dum
+eble
+ekz
+ekzemple
+en
+estas
+estis
+estos
+estu
+estus
+eĉ
+f-no
+feliĉan
+for
+fraŭlino
+ha
+havas
+havis
+havos
+havu
+havus
+he
+ho
+hu
+ili
+ilia
+ilian
+ilin
+inter
+io
+ion
+iu
+iujn
+iun
+ja
+jam
+je
+jes
+k
+kaj
+ke
+kio
+kion
+kiu
+kiujn
+kiun
+kvankam
+kvar
+kvara
+kvazaŭ
+kvin
+kvina
+la
+li
+lia
+lian
+lin
+malantaŭ
+male
+malgraŭ
+mem
+mi
+mia
+mian
+min
+minus
+naŭ
+naŭa
+ne
+nek
+nenio
+nenion
+neniu
+neniun
+nepre
+ni
+nia
+nian
+nin
+nu
+nun
+nur
+ok
+oka
+oni
+onia
+onian
+onin
+plej
+pli
+plu
+plus
+por
+post
+preter
+s-no
+s-ro
+se
+sed
+sep
+sepa
+ses
+sesa
+si
+sia
+sian
+sin
+sinjor'
+sinjorino
+sinjoro
+sub
+super
+supren
+sur
+tamen
+tio
+tion
+tiu
+tiujn
+tiun
+tra
+tri
+tria
+tuj
+tute
+unu
+unua
+ve
+verŝajne
+vi
+via
+vian
+vin
+ĉi
+ĉio
+ĉion
+ĉiu
+ĉiujn
+ĉiun
+ĉu
+ĝi
+ĝia
+ĝian
+ĝin
+ĝis
+ĵus
+ŝi
+ŝia
+ŝin
\ No newline at end of file
--- /dev/null
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+_
+a
+actualmente
+acuerdo
+adelante
+ademas
+además
+adrede
+afirmó
+agregó
+ahi
+ahora
+ahí
+al
+algo
+alguna
+algunas
+alguno
+algunos
+algún
+alli
+allí
+alrededor
+ambos
+ampleamos
+antano
+antaño
+ante
+anterior
+antes
+apenas
+aproximadamente
+aquel
+aquella
+aquellas
+aquello
+aquellos
+aqui
+aquél
+aquélla
+aquéllas
+aquéllos
+aquí
+arriba
+arribaabajo
+aseguró
+asi
+así
+atras
+aun
+aunque
+ayer
+añadió
+aún
+b
+bajo
+bastante
+bien
+breve
+buen
+buena
+buenas
+bueno
+buenos
+c
+cada
+casi
+cerca
+cierta
+ciertas
+cierto
+ciertos
+cinco
+claro
+comentó
+como
+con
+conmigo
+conocer
+conseguimos
+conseguir
+considera
+consideró
+consigo
+consigue
+consiguen
+consigues
+contigo
+contra
+cosas
+creo
+cual
+cuales
+cualquier
+cuando
+cuanta
+cuantas
+cuanto
+cuantos
+cuatro
+cuenta
+cuál
+cuáles
+cuándo
+cuánta
+cuántas
+cuánto
+cuántos
+cómo
+d
+da
+dado
+dan
+dar
+de
+debajo
+debe
+deben
+debido
+decir
+dejó
+del
+delante
+demasiado
+demás
+dentro
+deprisa
+desde
+despacio
+despues
+después
+detras
+detrás
+dia
+dias
+dice
+dicen
+dicho
+dieron
+diferente
+diferentes
+dijeron
+dijo
+dio
+donde
+dos
+durante
+día
+días
+dónde
+e
+ejemplo
+el
+ella
+ellas
+ello
+ellos
+embargo
+empleais
+emplean
+emplear
+empleas
+empleo
+en
+encima
+encuentra
+enfrente
+enseguida
+entonces
+entre
+era
+erais
+eramos
+eran
+eras
+eres
+es
+esa
+esas
+ese
+eso
+esos
+esta
+estaba
+estabais
+estaban
+estabas
+estad
+estada
+estadas
+estado
+estados
+estais
+estamos
+estan
+estando
+estar
+estaremos
+estará
+estarán
+estarás
+estaré
+estaréis
+estaría
+estaríais
+estaríamos
+estarían
+estarías
+estas
+este
+estemos
+esto
+estos
+estoy
+estuve
+estuviera
+estuvierais
+estuvieran
+estuvieras
+estuvieron
+estuviese
+estuvieseis
+estuviesen
+estuvieses
+estuvimos
+estuviste
+estuvisteis
+estuviéramos
+estuviésemos
+estuvo
+está
+estábamos
+estáis
+están
+estás
+esté
+estéis
+estén
+estés
+ex
+excepto
+existe
+existen
+explicó
+expresó
+f
+fin
+final
+fue
+fuera
+fuerais
+fueran
+fueras
+fueron
+fuese
+fueseis
+fuesen
+fueses
+fui
+fuimos
+fuiste
+fuisteis
+fuéramos
+fuésemos
+g
+general
+gran
+grandes
+gueno
+h
+ha
+haber
+habia
+habida
+habidas
+habido
+habidos
+habiendo
+habla
+hablan
+habremos
+habrá
+habrán
+habrás
+habré
+habréis
+habría
+habríais
+habríamos
+habrían
+habrías
+habéis
+había
+habíais
+habíamos
+habían
+habías
+hace
+haceis
+hacemos
+hacen
+hacer
+hacerlo
+haces
+hacia
+haciendo
+hago
+han
+has
+hasta
+hay
+haya
+hayamos
+hayan
+hayas
+hayáis
+he
+hecho
+hemos
+hicieron
+hizo
+horas
+hoy
+hube
+hubiera
+hubierais
+hubieran
+hubieras
+hubieron
+hubiese
+hubieseis
+hubiesen
+hubieses
+hubimos
+hubiste
+hubisteis
+hubiéramos
+hubiésemos
+hubo
+i
+igual
+incluso
+indicó
+informo
+informó
+intenta
+intentais
+intentamos
+intentan
+intentar
+intentas
+intento
+ir
+j
+junto
+k
+l
+la
+lado
+largo
+las
+le
+lejos
+les
+llegó
+lleva
+llevar
+lo
+los
+luego
+lugar
+m
+mal
+manera
+manifestó
+mas
+mayor
+me
+mediante
+medio
+mejor
+mencionó
+menos
+menudo
+mi
+mia
+mias
+mientras
+mio
+mios
+mis
+misma
+mismas
+mismo
+mismos
+modo
+momento
+mucha
+muchas
+mucho
+muchos
+muy
+más
+mí
+mía
+mías
+mío
+míos
+n
+nada
+nadie
+ni
+ninguna
+ningunas
+ninguno
+ningunos
+ningún
+no
+nos
+nosotras
+nosotros
+nuestra
+nuestras
+nuestro
+nuestros
+nueva
+nuevas
+nuevo
+nuevos
+nunca
+o
+ocho
+os
+otra
+otras
+otro
+otros
+p
+pais
+para
+parece
+parte
+partir
+pasada
+pasado
+paìs
+peor
+pero
+pesar
+poca
+pocas
+poco
+pocos
+podeis
+podemos
+poder
+podria
+podriais
+podriamos
+podrian
+podrias
+podrá
+podrán
+podría
+podrían
+poner
+por
+por qué
+porque
+posible
+primer
+primera
+primero
+primeros
+principalmente
+pronto
+propia
+propias
+propio
+propios
+proximo
+próximo
+próximos
+pudo
+pueda
+puede
+pueden
+puedo
+pues
+q
+qeu
+que
+quedó
+queremos
+quien
+quienes
+quiere
+quiza
+quizas
+quizá
+quizás
+quién
+quiénes
+qué
+r
+raras
+realizado
+realizar
+realizó
+repente
+respecto
+s
+sabe
+sabeis
+sabemos
+saben
+saber
+sabes
+sal
+salvo
+se
+sea
+seamos
+sean
+seas
+segun
+segunda
+segundo
+según
+seis
+ser
+sera
+seremos
+será
+serán
+serás
+seré
+seréis
+sería
+seríais
+seríamos
+serían
+serías
+seáis
+señaló
+si
+sido
+siempre
+siendo
+siete
+sigue
+siguiente
+sin
+sino
+sobre
+sois
+sola
+solamente
+solas
+solo
+solos
+somos
+son
+soy
+soyos
+su
+supuesto
+sus
+suya
+suyas
+suyo
+suyos
+sé
+sí
+sólo
+t
+tal
+tambien
+también
+tampoco
+tan
+tanto
+tarde
+te
+temprano
+tendremos
+tendrá
+tendrán
+tendrás
+tendré
+tendréis
+tendría
+tendríais
+tendríamos
+tendrían
+tendrías
+tened
+teneis
+tenemos
+tener
+tenga
+tengamos
+tengan
+tengas
+tengo
+tengáis
+tenida
+tenidas
+tenido
+tenidos
+teniendo
+tenéis
+tenía
+teníais
+teníamos
+tenían
+tenías
+tercera
+ti
+tiempo
+tiene
+tienen
+tienes
+toda
+todas
+todavia
+todavía
+todo
+todos
+total
+trabaja
+trabajais
+trabajamos
+trabajan
+trabajar
+trabajas
+trabajo
+tras
+trata
+través
+tres
+tu
+tus
+tuve
+tuviera
+tuvierais
+tuvieran
+tuvieras
+tuvieron
+tuviese
+tuvieseis
+tuviesen
+tuvieses
+tuvimos
+tuviste
+tuvisteis
+tuviéramos
+tuviésemos
+tuvo
+tuya
+tuyas
+tuyo
+tuyos
+tú
+u
+ultimo
+un
+una
+unas
+uno
+unos
+usa
+usais
+usamos
+usan
+usar
+usas
+uso
+usted
+ustedes
+v
+va
+vais
+valor
+vamos
+van
+varias
+varios
+vaya
+veces
+ver
+verdad
+verdadera
+verdadero
+vez
+vosotras
+vosotros
+voy
+vuestra
+vuestras
+vuestro
+vuestros
+w
+x
+y
+ya
+yo
+z
+él
+éramos
+ésa
+ésas
+ése
+ésos
+ésta
+éstas
+éste
+éstos
+última
+últimas
+último
+últimos
\ No newline at end of file
--- /dev/null
+aga
+ei
+et
+ja
+jah
+kas
+kui
+kõik
+ma
+me
+mida
+midagi
+mind
+minu
+mis
+mu
+mul
+mulle
+nad
+nii
+oled
+olen
+oli
+oma
+on
+pole
+sa
+seda
+see
+selle
+siin
+siis
+ta
+te
+ära
\ No newline at end of file
--- /dev/null
+al
+anitz
+arabera
+asko
+baina
+bat
+batean
+batek
+bati
+batzuei
+batzuek
+batzuetan
+batzuk
+bera
+beraiek
+berau
+berauek
+bere
+berori
+beroriek
+beste
+bezala
+da
+dago
+dira
+ditu
+du
+dute
+edo
+egin
+ere
+eta
+eurak
+ez
+gainera
+gu
+gutxi
+guzti
+haiei
+haiek
+haietan
+hainbeste
+hala
+han
+handik
+hango
+hara
+hari
+hark
+hartan
+hau
+hauei
+hauek
+hauetan
+hemen
+hemendik
+hemengo
+hi
+hona
+honek
+honela
+honetan
+honi
+hor
+hori
+horiei
+horiek
+horietan
+horko
+horra
+horrek
+horrela
+horretan
+horri
+hortik
+hura
+izan
+ni
+noiz
+nola
+non
+nondik
+nongo
+nor
+nora
+ze
+zein
+zen
+zenbait
+zenbat
+zer
+zergatik
+ziren
+zituen
+zu
+zuek
+zuen
+zuten
\ No newline at end of file
--- /dev/null
+!
+,
+.
+:
+;
+،
+؛
+؟
+آباد
+آره
+آری
+آمد
+آمده
+آن
+آنان
+آنجا
+آنطور
+آنقدر
+آنكه
+آنها
+آنچه
+آنکه
+آورد
+آورده
+آيد
+آی
+آیا
+آیند
+اتفاقا
+اثرِ
+احتراما
+احتمالا
+اخیر
+اری
+از
+ازجمله
+اساسا
+است
+استفاد
+استفاده
+اش
+اشکارا
+اصلا
+اصولا
+اعلام
+اغلب
+اكنون
+الان
+البته
+البتّه
+ام
+اما
+امروز
+امروزه
+امسال
+امشب
+امور
+ان
+انجام
+اند
+انشاالله
+انصافا
+انطور
+انقدر
+انها
+انچنان
+انکه
+انگار
+او
+اول
+اولا
+اي
+ايشان
+ايم
+اين
+اينكه
+اکثرا
+اکنون
+اگر
+ای
+ایا
+اید
+ایشان
+ایم
+این
+اینجا
+ایند
+اینطور
+اینقدر
+اینها
+اینچنین
+اینک
+اینکه
+اینگونه
+با
+بار
+بارة
+باره
+بارها
+باز
+بازهم
+باش
+باشد
+باشم
+باشند
+باشيم
+باشی
+باشید
+باشیم
+بالا
+بالاخره
+بالایِ
+بالطبع
+بايد
+باید
+بتوان
+بتواند
+بتوانی
+بتوانیم
+بخش
+بخشی
+بخواه
+بخواهد
+بخواهم
+بخواهند
+بخواهی
+بخواهید
+بخواهیم
+بد
+بدون
+بر
+برابر
+برابرِ
+براحتی
+براساس
+براستی
+براي
+برای
+برایِ
+برخوردار
+برخي
+برخی
+برداري
+برعکس
+بروز
+بزرگ
+بزودی
+بسا
+بسيار
+بسياري
+بسیار
+بسیاری
+بطور
+بعد
+بعدا
+بعدها
+بعری
+بعضا
+بعضي
+بلافاصله
+بلكه
+بله
+بلکه
+بلی
+بنابراين
+بنابراین
+بندي
+به
+بهتر
+بهترين
+بود
+بودم
+بودن
+بودند
+بوده
+بودی
+بودید
+بودیم
+بویژه
+بي
+بيست
+بيش
+بيشتر
+بيشتري
+بين
+بکن
+بکند
+بکنم
+بکنند
+بکنی
+بکنید
+بکنیم
+بگو
+بگوید
+بگویم
+بگویند
+بگویی
+بگویید
+بگوییم
+بگیر
+بگیرد
+بگیرم
+بگیرند
+بگیری
+بگیرید
+بگیریم
+بی
+بیا
+بیاب
+بیابد
+بیابم
+بیابند
+بیابی
+بیابید
+بیابیم
+بیاور
+بیاورد
+بیاورم
+بیاورند
+بیاوری
+بیاورید
+بیاوریم
+بیاید
+بیایم
+بیایند
+بیایی
+بیایید
+بیاییم
+بیرون
+بیرونِ
+بیش
+بیشتر
+بیشتری
+بین
+ت
+تا
+تازه
+تاكنون
+تان
+تاکنون
+تحت
+تر
+تر براساس
+ترين
+تقریبا
+تلویحا
+تمام
+تماما
+تمامي
+تنها
+تو
+تواند
+توانست
+توانستم
+توانستن
+توانستند
+توانسته
+توانستی
+توانستیم
+توانم
+توانند
+توانی
+توانید
+توانیم
+توسط
+تولِ
+تویِ
+ثانیا
+جا
+جاي
+جايي
+جای
+جدا
+جديد
+جدید
+جريان
+جریان
+جز
+جلوگيري
+جلویِ
+جمعا
+جناح
+جهت
+حاضر
+حال
+حالا
+حتما
+حتي
+حتی
+حداکثر
+حدودا
+حدودِ
+حق
+خارجِ
+خب
+خدمات
+خصوصا
+خلاصه
+خواست
+خواستم
+خواستن
+خواستند
+خواسته
+خواستی
+خواستید
+خواستیم
+خواهد
+خواهم
+خواهند
+خواهيم
+خواهی
+خواهید
+خواهیم
+خوب
+خود
+خودت
+خودتان
+خودش
+خودشان
+خودم
+خودمان
+خوشبختانه
+خويش
+خویش
+خویشتن
+خیاه
+خیر
+خیلی
+داد
+دادم
+دادن
+دادند
+داده
+دادی
+دادید
+دادیم
+دار
+دارد
+دارم
+دارند
+داريم
+داری
+دارید
+داریم
+داشت
+داشتم
+داشتن
+داشتند
+داشته
+داشتی
+داشتید
+داشتیم
+دانست
+دانند
+دایم
+دایما
+در
+درباره
+درمجموع
+درون
+دریغ
+دقیقا
+دنبالِ
+ده
+دهد
+دهم
+دهند
+دهی
+دهید
+دهیم
+دو
+دوباره
+دوم
+ديده
+ديروز
+ديگر
+ديگران
+ديگري
+دیر
+دیروز
+دیگر
+دیگران
+دیگری
+را
+راحت
+راسا
+راستی
+راه
+رسما
+رسید
+رفت
+رفته
+رو
+روب
+روز
+روزانه
+روزهاي
+روي
+روی
+رویِ
+ريزي
+زمان
+زمانی
+زمینه
+زود
+زياد
+زير
+زيرا
+زیر
+زیرِ
+سابق
+ساخته
+سازي
+سالانه
+سالیانه
+سایر
+سراسر
+سرانجام
+سریعا
+سریِ
+سعي
+سمتِ
+سوم
+سوي
+سوی
+سویِ
+سپس
+شان
+شايد
+شاید
+شخصا
+شد
+شدم
+شدن
+شدند
+شده
+شدی
+شدید
+شدیدا
+شدیم
+شش
+شش نداشته
+شما
+شناسي
+شود
+شوم
+شوند
+شونده
+شوی
+شوید
+شویم
+صرفا
+صورت
+ضدِّ
+ضدِّ
+ضمن
+طبعا
+طبقِ
+طبیعتا
+طرف
+طريق
+طریق
+طور
+طي
+طی
+ظاهرا
+عدم
+عقبِ
+علّتِ
+علیه
+عمدا
+عمدتا
+عمل
+عملا
+عنوان
+عنوانِ
+غالبا
+غير
+غیر
+فردا
+فعلا
+فقط
+فكر
+فوق
+قابل
+قبل
+قبلا
+قدری
+قصدِ
+قطعا
+كرد
+كردم
+كردن
+كردند
+كرده
+كسي
+كل
+كمتر
+كند
+كنم
+كنند
+كنيد
+كنيم
+كه
+لااقل
+لطفا
+لطفاً
+ما
+مان
+مانند
+مانندِ
+مبادا
+متاسفانه
+متعاقبا
+مثل
+مثلا
+مثلِ
+مجانی
+مجددا
+مجموعا
+مختلف
+مدام
+مدت
+مدّتی
+مردم
+مرسی
+مستقیما
+مسلما
+مطمینا
+معمولا
+مقابل
+ممکن
+من
+موارد
+مورد
+موقتا
+مي
+ميليارد
+ميليون
+مگر
+می
+می شود
+میان
+میرسد
+میرود
+میشود
+میکنیم
+ناشي
+نام
+ناگاه
+ناگهان
+ناگهانی
+نبايد
+نباید
+نبود
+نخست
+نخستين
+نخواهد
+نخواهم
+نخواهند
+نخواهی
+نخواهید
+نخواهیم
+ندارد
+ندارم
+ندارند
+نداری
+ندارید
+نداریم
+نداشت
+نداشتم
+نداشتند
+نداشته
+نداشتی
+نداشتید
+نداشتیم
+نزديك
+نزدِ
+نزدیکِ
+نسبتا
+نشان
+نشده
+نظير
+نظیر
+نكرده
+نمايد
+نمي
+نمی
+نمیشود
+نه
+نهایتا
+نوع
+نوعي
+نوعی
+نيز
+نيست
+نگاه
+نیز
+نیست
+ها
+هاي
+هايي
+های
+هایی
+هبچ
+هر
+هرچه
+هرگز
+هزار
+هست
+هستم
+هستند
+هستيم
+هستی
+هستید
+هستیم
+هفت
+هم
+همان
+همه
+همواره
+همين
+همچنان
+همچنين
+همچنین
+همچون
+همیشه
+همین
+هنوز
+هنگام
+هنگامِ
+هنگامی
+هيچ
+هیچ
+هیچگاه
+و
+واقعا
+واقعی
+وجود
+وسطِ
+وضع
+وقتي
+وقتی
+وقتیکه
+ولی
+وي
+وگو
+وی
+ویژه
+يا
+يابد
+يك
+يكديگر
+يكي
+ّه
+٪
+پارسال
+پاعینِ
+پس
+پنج
+پيش
+پیدا
+پیش
+پیشاپیش
+پیشتر
+پیشِ
+چرا
+چطور
+چقدر
+چنان
+چنانچه
+چنانکه
+چند
+چندین
+چنين
+چنین
+چه
+چهار
+چو
+چون
+چيزي
+چگونه
+چیز
+چیزی
+چیست
+کاش
+کامل
+کاملا
+کتبا
+کجا
+کجاست
+کدام
+کرد
+کردم
+کردن
+کردند
+کرده
+کردی
+کردید
+کردیم
+کس
+کسانی
+کسی
+کل
+کلا
+کم
+کماکان
+کمتر
+کمتری
+کمی
+کن
+کنار
+کنارِ
+کند
+کنم
+کنند
+کننده
+کنون
+کنونی
+کنی
+کنید
+کنیم
+که
+کو
+کَی
+کی
+گاه
+گاهی
+گذاري
+گذاشته
+گذشته
+گردد
+گرفت
+گرفتم
+گرفتن
+گرفتند
+گرفته
+گرفتی
+گرفتید
+گرفتیم
+گروهي
+گفت
+گفتم
+گفتن
+گفتند
+گفته
+گفتی
+گفتید
+گفتیم
+گه
+گهگاه
+گو
+گويد
+گويند
+گویا
+گوید
+گویم
+گویند
+گویی
+گویید
+گوییم
+گيرد
+گيري
+گیرد
+گیرم
+گیرند
+گیری
+گیرید
+گیریم
+ی
+یا
+یابد
+یابم
+یابند
+یابی
+یابید
+یابیم
+یافت
+یافتم
+یافتن
+یافته
+یافتی
+یافتید
+یافتیم
+یعنی
+یقینا
+یه
+یک
+یکی
+۰
+۱
+۲
+۳
+۴
+۵
+۶
+۷
+۸
+۹
\ No newline at end of file
--- /dev/null
+aiemmin
+aika
+aikaa
+aikaan
+aikaisemmin
+aikaisin
+aikajen
+aikana
+aikoina
+aikoo
+aikovat
+aina
+ainakaan
+ainakin
+ainoa
+ainoat
+aiomme
+aion
+aiotte
+aist
+aivan
+ajan
+alas
+alemmas
+alkuisin
+alkuun
+alla
+alle
+aloitamme
+aloitan
+aloitat
+aloitatte
+aloitattivat
+aloitettava
+aloitettevaksi
+aloitettu
+aloitimme
+aloitin
+aloitit
+aloititte
+aloittaa
+aloittamatta
+aloitti
+aloittivat
+alta
+aluksi
+alussa
+alusta
+annettavaksi
+annetteva
+annettu
+ansiosta
+antaa
+antamatta
+antoi
+aoua
+apu
+asia
+asiaa
+asian
+asiasta
+asiat
+asioiden
+asioihin
+asioita
+asti
+avuksi
+avulla
+avun
+avutta
+edelle
+edelleen
+edellä
+edeltä
+edemmäs
+edes
+edessä
+edestä
+ehkä
+ei
+eikä
+eilen
+eivät
+eli
+ellei
+elleivät
+ellemme
+ellen
+ellet
+ellette
+emme
+en
+enemmän
+eniten
+ennen
+ensi
+ensimmäinen
+ensimmäiseksi
+ensimmäisen
+ensimmäisenä
+ensimmäiset
+ensimmäisiksi
+ensimmäisinä
+ensimmäisiä
+ensimmäistä
+ensin
+entinen
+entisen
+entisiä
+entisten
+entistä
+enää
+eri
+erittäin
+erityisesti
+eräiden
+eräs
+eräät
+esi
+esiin
+esillä
+esimerkiksi
+et
+eteen
+etenkin
+etessa
+ette
+ettei
+että
+haikki
+halua
+haluaa
+haluamatta
+haluamme
+haluan
+haluat
+haluatte
+haluavat
+halunnut
+halusi
+halusimme
+halusin
+halusit
+halusitte
+halusivat
+halutessa
+haluton
+he
+hei
+heidän
+heidät
+heihin
+heille
+heillä
+heiltä
+heissä
+heistä
+heitä
+helposti
+heti
+hetkellä
+hieman
+hitaasti
+hoikein
+huolimatta
+huomenna
+hyvien
+hyviin
+hyviksi
+hyville
+hyviltä
+hyvin
+hyvinä
+hyvissä
+hyvistä
+hyviä
+hyvä
+hyvät
+hyvää
+hän
+häneen
+hänelle
+hänellä
+häneltä
+hänen
+hänessä
+hänestä
+hänet
+häntä
+ihan
+ilman
+ilmeisesti
+itse
+itsensä
+itseään
+ja
+jo
+johon
+joiden
+joihin
+joiksi
+joilla
+joille
+joilta
+joina
+joissa
+joista
+joita
+joka
+jokainen
+jokin
+joko
+joksi
+joku
+jolla
+jolle
+jolloin
+jolta
+jompikumpi
+jona
+jonka
+jonkin
+jonne
+joo
+jopa
+jos
+joskus
+jossa
+josta
+jota
+jotain
+joten
+jotenkin
+jotenkuten
+jotka
+jotta
+jouduimme
+jouduin
+jouduit
+jouduitte
+joudumme
+joudun
+joudutte
+joukkoon
+joukossa
+joukosta
+joutua
+joutui
+joutuivat
+joutumaan
+joutuu
+joutuvat
+juuri
+jälkeen
+jälleen
+jää
+kahdeksan
+kahdeksannen
+kahdella
+kahdelle
+kahdelta
+kahden
+kahdessa
+kahdesta
+kahta
+kahteen
+kai
+kaiken
+kaikille
+kaikilta
+kaikkea
+kaikki
+kaikkia
+kaikkiaan
+kaikkialla
+kaikkialle
+kaikkialta
+kaikkien
+kaikkin
+kaksi
+kannalta
+kannattaa
+kanssa
+kanssaan
+kanssamme
+kanssani
+kanssanne
+kanssasi
+kauan
+kauemmas
+kaukana
+kautta
+kehen
+keiden
+keihin
+keiksi
+keille
+keillä
+keiltä
+keinä
+keissä
+keistä
+keitten
+keittä
+keitä
+keneen
+keneksi
+kenelle
+kenellä
+keneltä
+kenen
+kenenä
+kenessä
+kenestä
+kenet
+kenettä
+kennessästä
+kenties
+kerran
+kerta
+kertaa
+keskellä
+kesken
+keskimäärin
+ketkä
+ketä
+kiitos
+kohti
+koko
+kokonaan
+kolmas
+kolme
+kolmen
+kolmesti
+koska
+koskaan
+kovin
+kuin
+kuinka
+kuinkan
+kuitenkaan
+kuitenkin
+kuka
+kukaan
+kukin
+kukka
+kumpainen
+kumpainenkaan
+kumpi
+kumpikaan
+kumpikin
+kun
+kuten
+kuuden
+kuusi
+kuutta
+kylliksi
+kyllä
+kymmenen
+kyse
+liian
+liki
+lisäksi
+lisää
+lla
+luo
+luona
+lähekkäin
+lähelle
+lähellä
+läheltä
+lähemmäs
+lähes
+lähinnä
+lähtien
+läpi
+mahdollisimman
+mahdollista
+me
+meidän
+meidät
+meihin
+meille
+meillä
+meiltä
+meissä
+meistä
+meitä
+melkein
+melko
+menee
+meneet
+menemme
+menen
+menet
+menette
+menevät
+meni
+menimme
+menin
+menit
+menivät
+mennessä
+mennyt
+menossa
+mihin
+mikin
+miksi
+mikä
+mikäli
+mikään
+mille
+milloin
+milloinkan
+millä
+miltä
+minkä
+minne
+minua
+minulla
+minulle
+minulta
+minun
+minussa
+minusta
+minut
+minuun
+minä
+missä
+mistä
+miten
+mitkä
+mitä
+mitään
+moi
+molemmat
+mones
+monesti
+monet
+moni
+moniaalla
+moniaalle
+moniaalta
+monta
+muassa
+muiden
+muita
+muka
+mukaan
+mukaansa
+mukana
+mutta
+muu
+muualla
+muualle
+muualta
+muuanne
+muulloin
+muun
+muut
+muuta
+muutama
+muutaman
+muuten
+myöhemmin
+myös
+myöskin
+myöskään
+myötä
+ne
+neljä
+neljän
+neljää
+niiden
+niihin
+niiksi
+niille
+niillä
+niiltä
+niin
+niinä
+niissä
+niistä
+niitä
+noiden
+noihin
+noiksi
+noilla
+noille
+noilta
+noin
+noina
+noissa
+noista
+noita
+nopeammin
+nopeasti
+nopeiten
+nro
+nuo
+nyt
+näiden
+näihin
+näiksi
+näille
+näillä
+näiltä
+näin
+näinä
+näissä
+näissähin
+näissälle
+näissältä
+näissästä
+näistä
+näitä
+nämä
+ohi
+oikea
+oikealla
+oikein
+ole
+olemme
+olen
+olet
+olette
+oleva
+olevan
+olevat
+oli
+olimme
+olin
+olisi
+olisimme
+olisin
+olisit
+olisitte
+olisivat
+olit
+olitte
+olivat
+olla
+olleet
+olli
+ollut
+oma
+omaa
+omaan
+omaksi
+omalle
+omalta
+oman
+omassa
+omat
+omia
+omien
+omiin
+omiksi
+omille
+omilta
+omissa
+omista
+on
+onkin
+onko
+ovat
+paikoittain
+paitsi
+pakosti
+paljon
+paremmin
+parempi
+parhaillaan
+parhaiten
+perusteella
+peräti
+pian
+pieneen
+pieneksi
+pienelle
+pienellä
+pieneltä
+pienempi
+pienestä
+pieni
+pienin
+poikki
+puolesta
+puolestaan
+päälle
+runsaasti
+saakka
+sadam
+sama
+samaa
+samaan
+samalla
+samallalta
+samallassa
+samallasta
+saman
+samat
+samoin
+sata
+sataa
+satojen
+se
+seitsemän
+sekä
+sen
+seuraavat
+siellä
+sieltä
+siihen
+siinä
+siis
+siitä
+sijaan
+siksi
+sille
+silloin
+sillä
+silti
+siltä
+sinne
+sinua
+sinulla
+sinulle
+sinulta
+sinun
+sinussa
+sinusta
+sinut
+sinuun
+sinä
+sisäkkäin
+sisällä
+siten
+sitten
+sitä
+ssa
+sta
+suoraan
+suuntaan
+suuren
+suuret
+suuri
+suuria
+suurin
+suurten
+taa
+taas
+taemmas
+tahansa
+tai
+takaa
+takaisin
+takana
+takia
+tallä
+tapauksessa
+tarpeeksi
+tavalla
+tavoitteena
+te
+teidän
+teidät
+teihin
+teille
+teillä
+teiltä
+teissä
+teistä
+teitä
+tietysti
+todella
+toinen
+toisaalla
+toisaalle
+toisaalta
+toiseen
+toiseksi
+toisella
+toiselle
+toiselta
+toisemme
+toisen
+toisensa
+toisessa
+toisesta
+toista
+toistaiseksi
+toki
+tosin
+tuhannen
+tuhat
+tule
+tulee
+tulemme
+tulen
+tulet
+tulette
+tulevat
+tulimme
+tulin
+tulisi
+tulisimme
+tulisin
+tulisit
+tulisitte
+tulisivat
+tulit
+tulitte
+tulivat
+tulla
+tulleet
+tullut
+tuntuu
+tuo
+tuohon
+tuoksi
+tuolla
+tuolle
+tuolloin
+tuolta
+tuon
+tuona
+tuonne
+tuossa
+tuosta
+tuota
+tuotä
+tuskin
+tykö
+tähän
+täksi
+tälle
+tällä
+tällöin
+tältä
+tämä
+tämän
+tänne
+tänä
+tänään
+tässä
+tästä
+täten
+tätä
+täysin
+täytyvät
+täytyy
+täällä
+täältä
+ulkopuolella
+usea
+useasti
+useimmiten
+usein
+useita
+uudeksi
+uudelleen
+uuden
+uudet
+uusi
+uusia
+uusien
+uusinta
+uuteen
+uutta
+vaan
+vahemmän
+vai
+vaiheessa
+vaikea
+vaikean
+vaikeat
+vaikeilla
+vaikeille
+vaikeilta
+vaikeissa
+vaikeista
+vaikka
+vain
+varmasti
+varsin
+varsinkin
+varten
+vasen
+vasenmalla
+vasta
+vastaan
+vastakkain
+vastan
+verran
+vielä
+vierekkäin
+vieressä
+vieri
+viiden
+viime
+viimeinen
+viimeisen
+viimeksi
+viisi
+voi
+voidaan
+voimme
+voin
+voisi
+voit
+voitte
+voivat
+vuoden
+vuoksi
+vuosi
+vuosien
+vuosina
+vuotta
+vähemmän
+vähintään
+vähiten
+vähän
+välillä
+yhdeksän
+yhden
+yhdessä
+yhteen
+yhteensä
+yhteydessä
+yhteyteen
+yhtä
+yhtäälle
+yhtäällä
+yhtäältä
+yhtään
+yhä
+yksi
+yksin
+yksittäin
+yleensä
+ylemmäs
+yli
+ylös
+ympäri
+älköön
+älä
\ No newline at end of file
--- /dev/null
+a
+abord
+absolument
+afin
+ah
+ai
+aie
+aient
+aies
+ailleurs
+ainsi
+ait
+allaient
+allo
+allons
+allô
+alors
+anterieur
+anterieure
+anterieures
+apres
+après
+as
+assez
+attendu
+au
+aucun
+aucune
+aucuns
+aujourd
+aujourd'hui
+aupres
+auquel
+aura
+aurai
+auraient
+aurais
+aurait
+auras
+aurez
+auriez
+aurions
+aurons
+auront
+aussi
+autant
+autre
+autrefois
+autrement
+autres
+autrui
+aux
+auxquelles
+auxquels
+avaient
+avais
+avait
+avant
+avec
+avez
+aviez
+avions
+avoir
+avons
+ayant
+ayez
+ayons
+b
+bah
+bas
+basee
+bat
+beau
+beaucoup
+bien
+bigre
+bon
+boum
+bravo
+brrr
+c
+car
+ce
+ceci
+cela
+celle
+celle-ci
+celle-là
+celles
+celles-ci
+celles-là
+celui
+celui-ci
+celui-là
+celà
+cent
+cependant
+certain
+certaine
+certaines
+certains
+certes
+ces
+cet
+cette
+ceux
+ceux-ci
+ceux-là
+chacun
+chacune
+chaque
+cher
+chers
+chez
+chiche
+chut
+chère
+chères
+ci
+cinq
+cinquantaine
+cinquante
+cinquantième
+cinquième
+clac
+clic
+combien
+comme
+comment
+comparable
+comparables
+compris
+concernant
+contre
+couic
+crac
+d
+da
+dans
+de
+debout
+dedans
+dehors
+deja
+delà
+depuis
+dernier
+derniere
+derriere
+derrière
+des
+desormais
+desquelles
+desquels
+dessous
+dessus
+deux
+deuxième
+deuxièmement
+devant
+devers
+devra
+devrait
+different
+differentes
+differents
+différent
+différente
+différentes
+différents
+dire
+directe
+directement
+dit
+dite
+dits
+divers
+diverse
+diverses
+dix
+dix-huit
+dix-neuf
+dix-sept
+dixième
+doit
+doivent
+donc
+dont
+dos
+douze
+douzième
+dring
+droite
+du
+duquel
+durant
+dès
+début
+désormais
+e
+effet
+egale
+egalement
+egales
+eh
+elle
+elle-même
+elles
+elles-mêmes
+en
+encore
+enfin
+entre
+envers
+environ
+es
+essai
+est
+et
+etant
+etc
+etre
+eu
+eue
+eues
+euh
+eurent
+eus
+eusse
+eussent
+eusses
+eussiez
+eussions
+eut
+eux
+eux-mêmes
+exactement
+excepté
+extenso
+exterieur
+eûmes
+eût
+eûtes
+f
+fais
+faisaient
+faisant
+fait
+faites
+façon
+feront
+fi
+flac
+floc
+fois
+font
+force
+furent
+fus
+fusse
+fussent
+fusses
+fussiez
+fussions
+fut
+fûmes
+fût
+fûtes
+g
+gens
+h
+ha
+haut
+hein
+hem
+hep
+hi
+ho
+holà
+hop
+hormis
+hors
+hou
+houp
+hue
+hui
+huit
+huitième
+hum
+hurrah
+hé
+hélas
+i
+ici
+il
+ils
+importe
+j
+je
+jusqu
+jusque
+juste
+k
+l
+la
+laisser
+laquelle
+las
+le
+lequel
+les
+lesquelles
+lesquels
+leur
+leurs
+longtemps
+lors
+lorsque
+lui
+lui-meme
+lui-même
+là
+lès
+m
+ma
+maint
+maintenant
+mais
+malgre
+malgré
+maximale
+me
+meme
+memes
+merci
+mes
+mien
+mienne
+miennes
+miens
+mille
+mince
+mine
+minimale
+moi
+moi-meme
+moi-même
+moindres
+moins
+mon
+mot
+moyennant
+multiple
+multiples
+même
+mêmes
+n
+na
+naturel
+naturelle
+naturelles
+ne
+neanmoins
+necessaire
+necessairement
+neuf
+neuvième
+ni
+nombreuses
+nombreux
+nommés
+non
+nos
+notamment
+notre
+nous
+nous-mêmes
+nouveau
+nouveaux
+nul
+néanmoins
+nôtre
+nôtres
+o
+oh
+ohé
+ollé
+olé
+on
+ont
+onze
+onzième
+ore
+ou
+ouf
+ouias
+oust
+ouste
+outre
+ouvert
+ouverte
+ouverts
+o|
+où
+p
+paf
+pan
+par
+parce
+parfois
+parle
+parlent
+parler
+parmi
+parole
+parseme
+partant
+particulier
+particulière
+particulièrement
+pas
+passé
+pendant
+pense
+permet
+personne
+personnes
+peu
+peut
+peuvent
+peux
+pff
+pfft
+pfut
+pif
+pire
+pièce
+plein
+plouf
+plupart
+plus
+plusieurs
+plutôt
+possessif
+possessifs
+possible
+possibles
+pouah
+pour
+pourquoi
+pourrais
+pourrait
+pouvait
+prealable
+precisement
+premier
+première
+premièrement
+pres
+probable
+probante
+procedant
+proche
+près
+psitt
+pu
+puis
+puisque
+pur
+pure
+q
+qu
+quand
+quant
+quant-à-soi
+quanta
+quarante
+quatorze
+quatre
+quatre-vingt
+quatrième
+quatrièmement
+que
+quel
+quelconque
+quelle
+quelles
+quelqu'un
+quelque
+quelques
+quels
+qui
+quiconque
+quinze
+quoi
+quoique
+r
+rare
+rarement
+rares
+relative
+relativement
+remarquable
+rend
+rendre
+restant
+reste
+restent
+restrictif
+retour
+revoici
+revoilà
+rien
+s
+sa
+sacrebleu
+sait
+sans
+sapristi
+sauf
+se
+sein
+seize
+selon
+semblable
+semblaient
+semble
+semblent
+sent
+sept
+septième
+sera
+serai
+seraient
+serais
+serait
+seras
+serez
+seriez
+serions
+serons
+seront
+ses
+seul
+seule
+seulement
+si
+sien
+sienne
+siennes
+siens
+sinon
+six
+sixième
+soi
+soi-même
+soient
+sois
+soit
+soixante
+sommes
+son
+sont
+sous
+souvent
+soyez
+soyons
+specifique
+specifiques
+speculatif
+stop
+strictement
+subtiles
+suffisant
+suffisante
+suffit
+suis
+suit
+suivant
+suivante
+suivantes
+suivants
+suivre
+sujet
+superpose
+sur
+surtout
+t
+ta
+tac
+tandis
+tant
+tardive
+te
+tel
+telle
+tellement
+telles
+tels
+tenant
+tend
+tenir
+tente
+tes
+tic
+tien
+tienne
+tiennes
+tiens
+toc
+toi
+toi-même
+ton
+touchant
+toujours
+tous
+tout
+toute
+toutefois
+toutes
+treize
+trente
+tres
+trois
+troisième
+troisièmement
+trop
+très
+tsoin
+tsouin
+tu
+té
+u
+un
+une
+unes
+uniformement
+unique
+uniques
+uns
+v
+va
+vais
+valeur
+vas
+vers
+via
+vif
+vifs
+vingt
+vivat
+vive
+vives
+vlan
+voici
+voie
+voient
+voilà
+voire
+vont
+vos
+votre
+vous
+vous-mêmes
+vu
+vé
+vôtre
+vôtres
+w
+x
+y
+z
+zut
+à
+â
+ça
+ès
+étaient
+étais
+était
+étant
+état
+étiez
+étions
+été
+étée
+étées
+étés
+êtes
+être
+ô
\ No newline at end of file
--- /dev/null
+a
+ach
+ag
+agus
+an
+aon
+ar
+arna
+as
+b'
+ba
+beirt
+bhúr
+caoga
+ceathair
+ceathrar
+chomh
+chtó
+chuig
+chun
+cois
+céad
+cúig
+cúigear
+d'
+daichead
+dar
+de
+deich
+deichniúr
+den
+dhá
+do
+don
+dtí
+dá
+dár
+dó
+faoi
+faoin
+faoina
+faoinár
+fara
+fiche
+gach
+gan
+go
+gur
+haon
+hocht
+i
+iad
+idir
+in
+ina
+ins
+inár
+is
+le
+leis
+lena
+lenár
+m'
+mar
+mo
+mé
+na
+nach
+naoi
+naonúr
+ná
+ní
+níor
+nó
+nócha
+ocht
+ochtar
+os
+roimh
+sa
+seacht
+seachtar
+seachtó
+seasca
+seisear
+siad
+sibh
+sinn
+sna
+sé
+sí
+tar
+thar
+thú
+triúr
+trí
+trína
+trínár
+tríocha
+tú
+um
+ár
+é
+éis
+í
+ó
+ón
+óna
+ónár
\ No newline at end of file
--- /dev/null
+a
+alí
+ao
+aos
+aquel
+aquela
+aquelas
+aqueles
+aquilo
+aquí
+as
+así
+aínda
+ben
+cando
+che
+co
+coa
+coas
+comigo
+con
+connosco
+contigo
+convosco
+cos
+cun
+cunha
+cunhas
+cuns
+da
+dalgunha
+dalgunhas
+dalgún
+dalgúns
+das
+de
+del
+dela
+delas
+deles
+desde
+deste
+do
+dos
+dun
+dunha
+dunhas
+duns
+e
+el
+ela
+elas
+eles
+en
+era
+eran
+esa
+esas
+ese
+eses
+esta
+estaba
+estar
+este
+estes
+estiven
+estou
+está
+están
+eu
+facer
+foi
+foron
+fun
+había
+hai
+iso
+isto
+la
+las
+lle
+lles
+lo
+los
+mais
+me
+meu
+meus
+min
+miña
+miñas
+moi
+na
+nas
+neste
+nin
+no
+non
+nos
+nosa
+nosas
+noso
+nosos
+nun
+nunha
+nunhas
+nuns
+nós
+o
+os
+ou
+para
+pero
+pode
+pois
+pola
+polas
+polo
+polos
+por
+que
+se
+senón
+ser
+seu
+seus
+sexa
+sido
+sobre
+súa
+súas
+tamén
+tan
+te
+ten
+ter
+teu
+teus
+teñen
+teño
+ti
+tido
+tiven
+tiña
+túa
+túas
+un
+unha
+unhas
+uns
+vos
+vosa
+vosas
+voso
+vosos
+vós
+á
+é
+ó
+ós
\ No newline at end of file
--- /dev/null
+અંગે
+અંદર
+અથવા
+અને
+અમને
+અમારું
+અમે
+અહીં
+આ
+આગળ
+આથી
+આનું
+આને
+આપણને
+આપણું
+આપણે
+આપી
+આર
+આવી
+આવે
+ઉપર
+ઉભા
+ઊંચે
+ઊભું
+એ
+એક
+એન
+એના
+એનાં
+એની
+એનું
+એને
+એનો
+એમ
+એવા
+એવાં
+એવી
+એવું
+એવો
+ઓછું
+કંઈક
+કઈ
+કયું
+કયો
+કરતાં
+કરવું
+કરી
+કરીએ
+કરું
+કરે
+કરેલું
+કર્યા
+કર્યાં
+કર્યું
+કર્યો
+કાંઈ
+કે
+કેટલું
+કેમ
+કેવી
+કેવું
+કોઈ
+કોઈક
+કોણ
+કોણે
+કોને
+ક્યાં
+ક્યારે
+ખૂબ
+ગઈ
+ગયા
+ગયાં
+ગયું
+ગયો
+ઘણું
+છ
+છતાં
+છીએ
+છું
+છે
+છેક
+છો
+જ
+જાય
+જી
+જે
+જેટલું
+જેને
+જેમ
+જેવી
+જેવું
+જેવો
+જો
+જોઈએ
+જ્યાં
+જ્યારે
+ઝાઝું
+તને
+તમને
+તમારું
+તમે
+તા
+તારાથી
+તારામાં
+તારું
+તું
+તે
+તેં
+તેઓ
+તેણે
+તેથી
+તેના
+તેની
+તેનું
+તેને
+તેમ
+તેમનું
+તેમને
+તેવી
+તેવું
+તો
+ત્યાં
+ત્યારે
+થઇ
+થઈ
+થઈએ
+થતા
+થતાં
+થતી
+થતું
+થતો
+થયા
+થયાં
+થયું
+થયેલું
+થયો
+થવું
+થાઉં
+થાઓ
+થાય
+થી
+થોડું
+દરેક
+ન
+નં
+નં.
+નથી
+નહિ
+નહી
+નહીં
+ના
+ની
+નીચે
+નું
+ને
+નો
+પછી
+પણ
+પર
+પરંતુ
+પહેલાં
+પાછળ
+પાસે
+પોતાનું
+પ્રત્યેક
+ફક્ત
+ફરી
+ફરીથી
+બંને
+બધા
+બધું
+બની
+બહાર
+બહુ
+બાદ
+બે
+મને
+મા
+માં
+માટે
+માત્ર
+મારું
+મી
+મૂકવું
+મૂકી
+મૂક્યા
+મૂક્યાં
+મૂક્યું
+મેં
+રહી
+રહે
+રહેવું
+રહ્યા
+રહ્યાં
+રહ્યો
+રીતે
+રૂ.
+રૂા
+લેતા
+લેતું
+લેવા
+વગેરે
+વધુ
+શકે
+શા
+શું
+સરખું
+સામે
+સુધી
+હતા
+હતાં
+હતી
+હતું
+હવે
+હશે
+હશો
+હા
+હું
+હો
+હોઈ
+હોઈશ
+હોઈશું
+હોય
+હોવા
\ No newline at end of file
--- /dev/null
+a
+amma
+ba
+ban
+ce
+cikin
+da
+don
+ga
+in
+ina
+ita
+ji
+ka
+ko
+kuma
+lokacin
+ma
+mai
+na
+ne
+ni
+sai
+shi
+su
+suka
+sun
+ta
+tafi
+take
+tana
+wani
+wannan
+wata
+ya
+yake
+yana
+yi
+za
\ No newline at end of file
--- /dev/null
+אבל
+או
+אולי
+אותה
+אותו
+אותי
+אותך
+אותם
+אותן
+אותנו
+אז
+אחר
+אחרות
+אחרי
+אחריכן
+אחרים
+אחרת
+אי
+איזה
+איך
+אין
+איפה
+איתה
+איתו
+איתי
+איתך
+איתכם
+איתכן
+איתם
+איתן
+איתנו
+אך
+אל
+אלה
+אלו
+אם
+אנחנו
+אני
+אס
+אף
+אצל
+אשר
+את
+אתה
+אתכם
+אתכן
+אתם
+אתן
+באיזומידה
+באמצע
+באמצעות
+בגלל
+בין
+בלי
+במידה
+במקוםשבו
+ברם
+בשביל
+בשעהש
+בתוך
+גם
+דרך
+הוא
+היא
+היה
+היכן
+היתה
+היתי
+הם
+הן
+הנה
+הסיבהשבגללה
+הרי
+ואילו
+ואת
+זאת
+זה
+זות
+יהיה
+יוכל
+יוכלו
+יותרמדי
+יכול
+יכולה
+יכולות
+יכולים
+יכל
+יכלה
+יכלו
+יש
+כאן
+כאשר
+כולם
+כולן
+כזה
+כי
+כיצד
+כך
+ככה
+כל
+כלל
+כמו
+כן
+כפי
+כש
+לא
+לאו
+לאיזותכלית
+לאן
+לבין
+לה
+להיות
+להם
+להן
+לו
+לי
+לכם
+לכן
+למה
+למטה
+למעלה
+למקוםשבו
+למרות
+לנו
+לעבר
+לעיכן
+לפיכך
+לפני
+מאד
+מאחורי
+מאיזוסיבה
+מאין
+מאיפה
+מבלי
+מבעד
+מדוע
+מה
+מהיכן
+מול
+מחוץ
+מי
+מכאן
+מכיוון
+מלבד
+מן
+מנין
+מסוגל
+מעט
+מעטים
+מעל
+מצד
+מקוםבו
+מתחת
+מתי
+נגד
+נגר
+נו
+עד
+עז
+על
+עלי
+עליה
+עליהם
+עליהן
+עליו
+עליך
+עליכם
+עלינו
+עם
+עצמה
+עצמהם
+עצמהן
+עצמו
+עצמי
+עצמם
+עצמן
+עצמנו
+פה
+רק
+שוב
+של
+שלה
+שלהם
+שלהן
+שלו
+שלי
+שלך
+שלכה
+שלכם
+שלכן
+שלנו
+שם
+תהיה
+תחת
\ No newline at end of file
--- /dev/null
+अंदर
+अत
+अदि
+अप
+अपना
+अपनि
+अपनी
+अपने
+अभि
+अभी
+आदि
+आप
+इंहिं
+इंहें
+इंहों
+इतयादि
+इत्यादि
+इन
+इनका
+इन्हीं
+इन्हें
+इन्हों
+इस
+इसका
+इसकि
+इसकी
+इसके
+इसमें
+इसि
+इसी
+इसे
+उंहिं
+उंहें
+उंहों
+उन
+उनका
+उनकि
+उनकी
+उनके
+उनको
+उन्हीं
+उन्हें
+उन्हों
+उस
+उसके
+उसि
+उसी
+उसे
+एक
+एवं
+एस
+एसे
+ऐसे
+ओर
+और
+कइ
+कई
+कर
+करता
+करते
+करना
+करने
+करें
+कहते
+कहा
+का
+काफि
+काफ़ी
+कि
+किंहें
+किंहों
+कितना
+किन्हें
+किन्हों
+किया
+किर
+किस
+किसि
+किसी
+किसे
+की
+कुछ
+कुल
+के
+को
+कोइ
+कोई
+कोन
+कोनसा
+कौन
+कौनसा
+गया
+घर
+जब
+जहाँ
+जहां
+जा
+जिंहें
+जिंहों
+जितना
+जिधर
+जिन
+जिन्हें
+जिन्हों
+जिस
+जिसे
+जीधर
+जेसा
+जेसे
+जैसा
+जैसे
+जो
+तक
+तब
+तरह
+तिंहें
+तिंहों
+तिन
+तिन्हें
+तिन्हों
+तिस
+तिसे
+तो
+था
+थि
+थी
+थे
+दबारा
+दवारा
+दिया
+दुसरा
+दुसरे
+दूसरे
+दो
+द्वारा
+न
+नहिं
+नहीं
+ना
+निचे
+निहायत
+नीचे
+ने
+पर
+पहले
+पुरा
+पूरा
+पे
+फिर
+बनि
+बनी
+बहि
+बही
+बहुत
+बाद
+बाला
+बिलकुल
+भि
+भितर
+भी
+भीतर
+मगर
+मानो
+मे
+में
+यदि
+यह
+यहाँ
+यहां
+यहि
+यही
+या
+यिह
+ये
+रखें
+रवासा
+रहा
+रहे
+ऱ्वासा
+लिए
+लिये
+लेकिन
+व
+वगेरह
+वरग
+वर्ग
+वह
+वहाँ
+वहां
+वहिं
+वहीं
+वाले
+वुह
+वे
+वग़ैरह
+संग
+सकता
+सकते
+सबसे
+सभि
+सभी
+साथ
+साबुत
+साभ
+सारा
+से
+सो
+हि
+ही
+हुअ
+हुआ
+हुइ
+हुई
+हुए
+हे
+हें
+है
+हैं
+हो
+होता
+होति
+होती
+होते
+होना
+होने
\ No newline at end of file
--- /dev/null
+a
+ako
+ali
+bi
+bih
+bila
+bili
+bilo
+bio
+bismo
+biste
+biti
+bumo
+da
+do
+duž
+ga
+hoće
+hoćemo
+hoćete
+hoćeš
+hoću
+i
+iako
+ih
+ili
+iz
+ja
+je
+jedna
+jedne
+jedno
+jer
+jesam
+jesi
+jesmo
+jest
+jeste
+jesu
+jim
+joj
+još
+ju
+kada
+kako
+kao
+koja
+koje
+koji
+kojima
+koju
+kroz
+li
+me
+mene
+meni
+mi
+mimo
+moj
+moja
+moje
+mu
+na
+nad
+nakon
+nam
+nama
+nas
+naš
+naša
+naše
+našeg
+ne
+nego
+neka
+neki
+nekog
+neku
+nema
+netko
+neće
+nećemo
+nećete
+nećeš
+neću
+nešto
+ni
+nije
+nikoga
+nikoje
+nikoju
+nisam
+nisi
+nismo
+niste
+nisu
+njega
+njegov
+njegova
+njegovo
+njemu
+njezin
+njezina
+njezino
+njih
+njihov
+njihova
+njihovo
+njim
+njima
+njoj
+nju
+no
+o
+od
+odmah
+on
+ona
+oni
+ono
+ova
+pa
+pak
+po
+pod
+pored
+prije
+s
+sa
+sam
+samo
+se
+sebe
+sebi
+si
+smo
+ste
+su
+sve
+svi
+svog
+svoj
+svoja
+svoje
+svom
+ta
+tada
+taj
+tako
+te
+tebe
+tebi
+ti
+to
+toj
+tome
+tu
+tvoj
+tvoja
+tvoje
+u
+uz
+vam
+vama
+vas
+vaš
+vaša
+vaše
+već
+vi
+vrlo
+za
+zar
+će
+ćemo
+ćete
+ćeš
+ću
+što
\ No newline at end of file
--- /dev/null
+a
+abba
+abban
+abból
+addig
+ahhoz
+ahogy
+ahol
+aki
+akik
+akkor
+akár
+alapján
+alatt
+alatta
+alattad
+alattam
+alattatok
+alattuk
+alattunk
+alá
+alád
+alájuk
+alám
+alánk
+alátok
+alól
+alóla
+alólad
+alólam
+alólatok
+alóluk
+alólunk
+amely
+amelybol
+amelyek
+amelyekben
+amelyeket
+amelyet
+amelyik
+amelynek
+ami
+amikor
+amit
+amolyan
+amott
+amíg
+annak
+annál
+arra
+arról
+attól
+az
+aznap
+azok
+azokat
+azokba
+azokban
+azokból
+azokhoz
+azokig
+azokkal
+azokká
+azoknak
+azoknál
+azokon
+azokra
+azokról
+azoktól
+azokért
+azon
+azonban
+azonnal
+azt
+aztán
+azután
+azzal
+azzá
+azért
+bal
+balra
+ban
+be
+belé
+beléd
+beléjük
+belém
+belénk
+belétek
+belül
+belőle
+belőled
+belőlem
+belőletek
+belőlük
+belőlünk
+ben
+benne
+benned
+bennem
+bennetek
+bennük
+bennünk
+bár
+bárcsak
+bármilyen
+búcsú
+cikk
+cikkek
+cikkeket
+csak
+csakhogy
+csupán
+de
+dehogy
+e
+ebbe
+ebben
+ebből
+eddig
+egy
+egyebek
+egyebet
+egyedül
+egyelőre
+egyes
+egyet
+egyetlen
+egyik
+egymás
+egyre
+egyszerre
+egyéb
+együtt
+egész
+egészen
+ehhez
+ekkor
+el
+eleinte
+ellen
+ellenes
+elleni
+ellenére
+elmondta
+elsõ
+első
+elsők
+elsősorban
+elsőt
+elé
+eléd
+elég
+eléjük
+elém
+elénk
+elétek
+elõ
+elõször
+elõtt
+elő
+előbb
+elől
+előle
+előled
+előlem
+előletek
+előlük
+előlünk
+először
+előtt
+előtte
+előtted
+előttem
+előttetek
+előttük
+előttünk
+előző
+emilyen
+engem
+ennek
+ennyi
+ennél
+enyém
+erre
+erről
+esetben
+ettől
+ez
+ezek
+ezekbe
+ezekben
+ezekből
+ezeken
+ezeket
+ezekhez
+ezekig
+ezekkel
+ezekké
+ezeknek
+ezeknél
+ezekre
+ezekről
+ezektől
+ezekért
+ezen
+ezentúl
+ezer
+ezret
+ezt
+ezután
+ezzel
+ezzé
+ezért
+fel
+fele
+felek
+felet
+felett
+felé
+fent
+fenti
+fél
+fölé
+gyakran
+ha
+halló
+hamar
+hanem
+harmadik
+harmadikat
+harminc
+hat
+hatodik
+hatodikat
+hatot
+hatvan
+helyett
+hetedik
+hetediket
+hetet
+hetven
+hirtelen
+hiszen
+hiába
+hogy
+hogyan
+hol
+holnap
+holnapot
+honnan
+hova
+hozzá
+hozzád
+hozzájuk
+hozzám
+hozzánk
+hozzátok
+hurrá
+huszadik
+hány
+hányszor
+hármat
+három
+hát
+hátha
+hátulsó
+hét
+húsz
+ide
+ide-оda
+idén
+igazán
+igen
+ill
+ill.
+illetve
+ilyen
+ilyenkor
+immár
+inkább
+is
+ismét
+ison
+itt
+jelenleg
+jobban
+jobbra
+jó
+jól
+jólesik
+jóval
+jövőre
+kell
+kellene
+kellett
+kelljen
+keressünk
+keresztül
+ketten
+kettő
+kettőt
+kevés
+ki
+kiben
+kiből
+kicsit
+kicsoda
+kihez
+kik
+kikbe
+kikben
+kikből
+kiken
+kiket
+kikhez
+kikkel
+kikké
+kiknek
+kiknél
+kikre
+kikről
+kiktől
+kikért
+kilenc
+kilencedik
+kilencediket
+kilencet
+kilencven
+kin
+kinek
+kinél
+kire
+kiről
+kit
+kitől
+kivel
+kivé
+kié
+kiért
+korábban
+képest
+kérem
+kérlek
+kész
+késő
+később
+későn
+két
+kétszer
+kívül
+körül
+köszönhetően
+köszönöm
+közben
+közel
+közepesen
+közepén
+közé
+között
+közül
+külön
+különben
+különböző
+különbözőbb
+különbözőek
+lassan
+le
+legalább
+legyen
+lehet
+lehetetlen
+lehetett
+lehetőleg
+lehetőség
+lenne
+lenni
+lennék
+lennének
+lesz
+leszek
+lesznek
+leszünk
+lett
+lettek
+lettem
+lettünk
+lévő
+ma
+maga
+magad
+magam
+magatokat
+magukat
+magunkat
+magát
+mai
+majd
+majdnem
+manapság
+meg
+megcsinál
+megcsinálnak
+megint
+megvan
+mellett
+mellette
+melletted
+mellettem
+mellettetek
+mellettük
+mellettünk
+mellé
+melléd
+melléjük
+mellém
+mellénk
+mellétek
+mellől
+mellőle
+mellőled
+mellőlem
+mellőletek
+mellőlük
+mellőlünk
+mely
+melyek
+melyik
+mennyi
+mert
+mi
+miatt
+miatta
+miattad
+miattam
+miattatok
+miattuk
+miattunk
+mibe
+miben
+miből
+mihez
+mik
+mikbe
+mikben
+mikből
+miken
+miket
+mikhez
+mikkel
+mikké
+miknek
+miknél
+mikor
+mikre
+mikről
+miktől
+mikért
+milyen
+min
+mind
+mindegyik
+mindegyiket
+minden
+mindenesetre
+mindenki
+mindent
+mindenütt
+mindig
+mindketten
+minek
+minket
+mint
+mintha
+minél
+mire
+miről
+mit
+mitől
+mivel
+mivé
+miért
+mondta
+most
+mostanáig
+már
+más
+másik
+másikat
+másnap
+második
+másodszor
+mások
+másokat
+mást
+még
+mégis
+míg
+mögé
+mögéd
+mögéjük
+mögém
+mögénk
+mögétek
+mögött
+mögötte
+mögötted
+mögöttem
+mögöttetek
+mögöttük
+mögöttünk
+mögül
+mögüle
+mögüled
+mögülem
+mögületek
+mögülük
+mögülünk
+múltkor
+múlva
+na
+nagy
+nagyobb
+nagyon
+naponta
+napot
+ne
+negyedik
+negyediket
+negyven
+neked
+nekem
+neki
+nekik
+nektek
+nekünk
+nem
+nemcsak
+nemrég
+nincs
+nyolc
+nyolcadik
+nyolcadikat
+nyolcat
+nyolcvan
+nála
+nálad
+nálam
+nálatok
+náluk
+nálunk
+négy
+négyet
+néha
+néhány
+nélkül
+o
+oda
+ok
+olyan
+onnan
+ott
+pedig
+persze
+pár
+például
+rajta
+rajtad
+rajtam
+rajtatok
+rajtuk
+rajtunk
+rendben
+rosszul
+rá
+rád
+rájuk
+rám
+ránk
+rátok
+régen
+régóta
+részére
+róla
+rólad
+rólam
+rólatok
+róluk
+rólunk
+rögtön
+s
+saját
+se
+sem
+semmi
+semmilyen
+semmiség
+senki
+soha
+sok
+sokan
+sokat
+sokkal
+sokszor
+sokáig
+során
+stb.
+szemben
+szerbusz
+szerint
+szerinte
+szerinted
+szerintem
+szerintetek
+szerintük
+szerintünk
+szervusz
+szinte
+számára
+száz
+századik
+százat
+szépen
+szét
+szíves
+szívesen
+szíveskedjék
+sőt
+talán
+tavaly
+te
+tegnap
+tegnapelőtt
+tehát
+tele
+teljes
+tessék
+ti
+tied
+titeket
+tizedik
+tizediket
+tizenegy
+tizenegyedik
+tizenhat
+tizenhárom
+tizenhét
+tizenkettedik
+tizenkettő
+tizenkilenc
+tizenkét
+tizennyolc
+tizennégy
+tizenöt
+tizet
+tovább
+további
+továbbá
+távol
+téged
+tényleg
+tíz
+több
+többi
+többször
+túl
+tőle
+tőled
+tőlem
+tőletek
+tőlük
+tőlünk
+ugyanakkor
+ugyanez
+ugyanis
+ugye
+urak
+uram
+urat
+utoljára
+utolsó
+után
+utána
+vagy
+vagyis
+vagyok
+vagytok
+vagyunk
+vajon
+valahol
+valaki
+valakit
+valamelyik
+valami
+valamint
+való
+van
+vannak
+vele
+veled
+velem
+veletek
+velük
+velünk
+vissza
+viszlát
+viszont
+viszontlátásra
+volna
+volnának
+volnék
+volt
+voltak
+voltam
+voltunk
+végre
+végén
+végül
+által
+általában
+ám
+át
+éljen
+én
+éppen
+érte
+érted
+értem
+értetek
+értük
+értünk
+és
+év
+évben
+éve
+évek
+éves
+évi
+évvel
+így
+óta
+õ
+õk
+õket
+ön
+önbe
+önben
+önből
+önhöz
+önnek
+önnel
+önnél
+önre
+önről
+önt
+öntől
+önért
+önök
+önökbe
+önökben
+önökből
+önöket
+önökhöz
+önökkel
+önöknek
+önöknél
+önökre
+önökről
+önöktől
+önökért
+önökön
+önön
+össze
+öt
+ötven
+ötödik
+ötödiket
+ötöt
+úgy
+úgyis
+úgynevezett
+új
+újabb
+újra
+úr
+ő
+ők
+őket
+őt
\ No newline at end of file
--- /dev/null
+այդ
+այլ
+այն
+այս
+դու
+դուք
+եմ
+են
+ենք
+ես
+եք
+է
+էի
+էին
+էինք
+էիր
+էիք
+էր
+ըստ
+թ
+ի
+ին
+իսկ
+իր
+կամ
+համար
+հետ
+հետո
+մենք
+մեջ
+մի
+ն
+նա
+նաև
+նրա
+նրանք
+որ
+որը
+որոնք
+որպես
+ու
+ում
+պիտի
+վրա
+և
\ No newline at end of file
--- /dev/null
+ada
+adalah
+adanya
+adapun
+agak
+agaknya
+agar
+akan
+akankah
+akhir
+akhiri
+akhirnya
+aku
+akulah
+amat
+amatlah
+anda
+andalah
+antar
+antara
+antaranya
+apa
+apaan
+apabila
+apakah
+apalagi
+apatah
+artinya
+asal
+asalkan
+atas
+atau
+ataukah
+ataupun
+awal
+awalnya
+bagai
+bagaikan
+bagaimana
+bagaimanakah
+bagaimanapun
+bagi
+bagian
+bahkan
+bahwa
+bahwasanya
+baik
+bakal
+bakalan
+balik
+banyak
+bapak
+baru
+bawah
+beberapa
+begini
+beginian
+beginikah
+beginilah
+begitu
+begitukah
+begitulah
+begitupun
+bekerja
+belakang
+belakangan
+belum
+belumlah
+benar
+benarkah
+benarlah
+berada
+berakhir
+berakhirlah
+berakhirnya
+berapa
+berapakah
+berapalah
+berapapun
+berarti
+berawal
+berbagai
+berdatangan
+beri
+berikan
+berikut
+berikutnya
+berjumlah
+berkali-kali
+berkata
+berkehendak
+berkeinginan
+berkenaan
+berlainan
+berlalu
+berlangsung
+berlebihan
+bermacam
+bermacam-macam
+bermaksud
+bermula
+bersama
+bersama-sama
+bersiap
+bersiap-siap
+bertanya
+bertanya-tanya
+berturut
+berturut-turut
+bertutur
+berujar
+berupa
+besar
+betul
+betulkah
+biasa
+biasanya
+bila
+bilakah
+bisa
+bisakah
+boleh
+bolehkah
+bolehlah
+buat
+bukan
+bukankah
+bukanlah
+bukannya
+bulan
+bung
+cara
+caranya
+cukup
+cukupkah
+cukuplah
+cuma
+dahulu
+dalam
+dan
+dapat
+dari
+daripada
+datang
+dekat
+demi
+demikian
+demikianlah
+dengan
+depan
+di
+dia
+diakhiri
+diakhirinya
+dialah
+diantara
+diantaranya
+diberi
+diberikan
+diberikannya
+dibuat
+dibuatnya
+didapat
+didatangkan
+digunakan
+diibaratkan
+diibaratkannya
+diingat
+diingatkan
+diinginkan
+dijawab
+dijelaskan
+dijelaskannya
+dikarenakan
+dikatakan
+dikatakannya
+dikerjakan
+diketahui
+diketahuinya
+dikira
+dilakukan
+dilalui
+dilihat
+dimaksud
+dimaksudkan
+dimaksudkannya
+dimaksudnya
+diminta
+dimintai
+dimisalkan
+dimulai
+dimulailah
+dimulainya
+dimungkinkan
+dini
+dipastikan
+diperbuat
+diperbuatnya
+dipergunakan
+diperkirakan
+diperlihatkan
+diperlukan
+diperlukannya
+dipersoalkan
+dipertanyakan
+dipunyai
+diri
+dirinya
+disampaikan
+disebut
+disebutkan
+disebutkannya
+disini
+disinilah
+ditambahkan
+ditandaskan
+ditanya
+ditanyai
+ditanyakan
+ditegaskan
+ditujukan
+ditunjuk
+ditunjuki
+ditunjukkan
+ditunjukkannya
+ditunjuknya
+dituturkan
+dituturkannya
+diucapkan
+diucapkannya
+diungkapkan
+dong
+dua
+dulu
+empat
+enggak
+enggaknya
+entah
+entahlah
+guna
+gunakan
+hal
+hampir
+hanya
+hanyalah
+hari
+harus
+haruslah
+harusnya
+hendak
+hendaklah
+hendaknya
+hingga
+ia
+ialah
+ibarat
+ibaratkan
+ibaratnya
+ibu
+ikut
+ingat
+ingat-ingat
+ingin
+inginkah
+inginkan
+ini
+inikah
+inilah
+itu
+itukah
+itulah
+jadi
+jadilah
+jadinya
+jangan
+jangankan
+janganlah
+jauh
+jawab
+jawaban
+jawabnya
+jelas
+jelaskan
+jelaslah
+jelasnya
+jika
+jikalau
+juga
+jumlah
+jumlahnya
+justru
+kala
+kalau
+kalaulah
+kalaupun
+kalian
+kami
+kamilah
+kamu
+kamulah
+kan
+kapan
+kapankah
+kapanpun
+karena
+karenanya
+kasus
+kata
+katakan
+katakanlah
+katanya
+ke
+keadaan
+kebetulan
+kecil
+kedua
+keduanya
+keinginan
+kelamaan
+kelihatan
+kelihatannya
+kelima
+keluar
+kembali
+kemudian
+kemungkinan
+kemungkinannya
+kenapa
+kepada
+kepadanya
+kesampaian
+keseluruhan
+keseluruhannya
+keterlaluan
+ketika
+khususnya
+kini
+kinilah
+kira
+kira-kira
+kiranya
+kita
+kitalah
+kok
+kurang
+lagi
+lagian
+lah
+lain
+lainnya
+lalu
+lama
+lamanya
+lanjut
+lanjutnya
+lebih
+lewat
+lima
+luar
+macam
+maka
+makanya
+makin
+malah
+malahan
+mampu
+mampukah
+mana
+manakala
+manalagi
+masa
+masalah
+masalahnya
+masih
+masihkah
+masing
+masing-masing
+mau
+maupun
+melainkan
+melakukan
+melalui
+melihat
+melihatnya
+memang
+memastikan
+memberi
+memberikan
+membuat
+memerlukan
+memihak
+meminta
+memintakan
+memisalkan
+memperbuat
+mempergunakan
+memperkirakan
+memperlihatkan
+mempersiapkan
+mempersoalkan
+mempertanyakan
+mempunyai
+memulai
+memungkinkan
+menaiki
+menambahkan
+menandaskan
+menanti
+menanti-nanti
+menantikan
+menanya
+menanyai
+menanyakan
+mendapat
+mendapatkan
+mendatang
+mendatangi
+mendatangkan
+menegaskan
+mengakhiri
+mengapa
+mengatakan
+mengatakannya
+mengenai
+mengerjakan
+mengetahui
+menggunakan
+menghendaki
+mengibaratkan
+mengibaratkannya
+mengingat
+mengingatkan
+menginginkan
+mengira
+mengucapkan
+mengucapkannya
+mengungkapkan
+menjadi
+menjawab
+menjelaskan
+menuju
+menunjuk
+menunjuki
+menunjukkan
+menunjuknya
+menurut
+menuturkan
+menyampaikan
+menyangkut
+menyatakan
+menyebutkan
+menyeluruh
+menyiapkan
+merasa
+mereka
+merekalah
+merupakan
+meski
+meskipun
+meyakini
+meyakinkan
+minta
+mirip
+misal
+misalkan
+misalnya
+mula
+mulai
+mulailah
+mulanya
+mungkin
+mungkinkah
+nah
+naik
+namun
+nanti
+nantinya
+nyaris
+nyatanya
+oleh
+olehnya
+pada
+padahal
+padanya
+pak
+paling
+panjang
+pantas
+para
+pasti
+pastilah
+penting
+pentingnya
+per
+percuma
+perlu
+perlukah
+perlunya
+pernah
+persoalan
+pertama
+pertama-tama
+pertanyaan
+pertanyakan
+pihak
+pihaknya
+pukul
+pula
+pun
+punya
+rasa
+rasanya
+rata
+rupanya
+saat
+saatnya
+saja
+sajalah
+saling
+sama
+sama-sama
+sambil
+sampai
+sampai-sampai
+sampaikan
+sana
+sangat
+sangatlah
+satu
+saya
+sayalah
+se
+sebab
+sebabnya
+sebagai
+sebagaimana
+sebagainya
+sebagian
+sebaik
+sebaik-baiknya
+sebaiknya
+sebaliknya
+sebanyak
+sebegini
+sebegitu
+sebelum
+sebelumnya
+sebenarnya
+seberapa
+sebesar
+sebetulnya
+sebisanya
+sebuah
+sebut
+sebutlah
+sebutnya
+secara
+secukupnya
+sedang
+sedangkan
+sedemikian
+sedikit
+sedikitnya
+seenaknya
+segala
+segalanya
+segera
+seharusnya
+sehingga
+seingat
+sejak
+sejauh
+sejenak
+sejumlah
+sekadar
+sekadarnya
+sekali
+sekali-kali
+sekalian
+sekaligus
+sekalipun
+sekarang
+sekecil
+seketika
+sekiranya
+sekitar
+sekitarnya
+sekurang-kurangnya
+sekurangnya
+sela
+selagi
+selain
+selaku
+selalu
+selama
+selama-lamanya
+selamanya
+selanjutnya
+seluruh
+seluruhnya
+semacam
+semakin
+semampu
+semampunya
+semasa
+semasih
+semata
+semata-mata
+semaunya
+sementara
+semisal
+semisalnya
+sempat
+semua
+semuanya
+semula
+sendiri
+sendirian
+sendirinya
+seolah
+seolah-olah
+seorang
+sepanjang
+sepantasnya
+sepantasnyalah
+seperlunya
+seperti
+sepertinya
+sepihak
+sering
+seringnya
+serta
+serupa
+sesaat
+sesama
+sesampai
+sesegera
+sesekali
+seseorang
+sesuatu
+sesuatunya
+sesudah
+sesudahnya
+setelah
+setempat
+setengah
+seterusnya
+setiap
+setiba
+setibanya
+setidak-tidaknya
+setidaknya
+setinggi
+seusai
+sewaktu
+siap
+siapa
+siapakah
+siapapun
+sini
+sinilah
+soal
+soalnya
+suatu
+sudah
+sudahkah
+sudahlah
+supaya
+tadi
+tadinya
+tahu
+tahun
+tak
+tambah
+tambahnya
+tampak
+tampaknya
+tandas
+tandasnya
+tanpa
+tanya
+tanyakan
+tanyanya
+tapi
+tegas
+tegasnya
+telah
+tempat
+tengah
+tentang
+tentu
+tentulah
+tentunya
+tepat
+terakhir
+terasa
+terbanyak
+terdahulu
+terdapat
+terdiri
+terhadap
+terhadapnya
+teringat
+teringat-ingat
+terjadi
+terjadilah
+terjadinya
+terkira
+terlalu
+terlebih
+terlihat
+termasuk
+ternyata
+tersampaikan
+tersebut
+tersebutlah
+tertentu
+tertuju
+terus
+terutama
+tetap
+tetapi
+tiap
+tiba
+tiba-tiba
+tidak
+tidakkah
+tidaklah
+tiga
+tinggi
+toh
+tunjuk
+turut
+tutur
+tuturnya
+ucap
+ucapnya
+ujar
+ujarnya
+umum
+umumnya
+ungkap
+ungkapnya
+untuk
+usah
+usai
+waduh
+wah
+wahai
+waktu
+waktunya
+walau
+walaupun
+wong
+yaitu
+yakin
+yakni
+yang
\ No newline at end of file
--- /dev/null
+a
+abbastanza
+abbia
+abbiamo
+abbiano
+abbiate
+accidenti
+ad
+adesso
+affinché
+agl
+agli
+ahime
+ahimè
+ai
+al
+alcuna
+alcuni
+alcuno
+all
+alla
+alle
+allo
+allora
+altre
+altri
+altrimenti
+altro
+altrove
+altrui
+anche
+ancora
+anni
+anno
+ansa
+anticipo
+assai
+attesa
+attraverso
+avanti
+avemmo
+avendo
+avente
+aver
+avere
+averlo
+avesse
+avessero
+avessi
+avessimo
+aveste
+avesti
+avete
+aveva
+avevamo
+avevano
+avevate
+avevi
+avevo
+avrai
+avranno
+avrebbe
+avrebbero
+avrei
+avremmo
+avremo
+avreste
+avresti
+avrete
+avrà
+avrò
+avuta
+avute
+avuti
+avuto
+basta
+ben
+bene
+benissimo
+brava
+bravo
+buono
+c
+caso
+cento
+certa
+certe
+certi
+certo
+che
+chi
+chicchessia
+chiunque
+ci
+ciascuna
+ciascuno
+cima
+cinque
+cio
+cioe
+cioè
+circa
+citta
+città
+ciò
+co
+codesta
+codesti
+codesto
+cogli
+coi
+col
+colei
+coll
+coloro
+colui
+come
+cominci
+comprare
+comunque
+con
+concernente
+conclusione
+consecutivi
+consecutivo
+consiglio
+contro
+cortesia
+cos
+cosa
+cosi
+così
+cui
+d
+da
+dagl
+dagli
+dai
+dal
+dall
+dalla
+dalle
+dallo
+dappertutto
+davanti
+degl
+degli
+dei
+del
+dell
+della
+delle
+dello
+dentro
+detto
+deve
+devo
+di
+dice
+dietro
+dire
+dirimpetto
+diventa
+diventare
+diventato
+dopo
+doppio
+dov
+dove
+dovra
+dovrà
+dovunque
+due
+dunque
+durante
+e
+ebbe
+ebbero
+ebbi
+ecc
+ecco
+ed
+effettivamente
+egli
+ella
+entrambi
+eppure
+era
+erano
+eravamo
+eravate
+eri
+ero
+esempio
+esse
+essendo
+esser
+essere
+essi
+ex
+fa
+faccia
+facciamo
+facciano
+facciate
+faccio
+facemmo
+facendo
+facesse
+facessero
+facessi
+facessimo
+faceste
+facesti
+faceva
+facevamo
+facevano
+facevate
+facevi
+facevo
+fai
+fanno
+farai
+faranno
+fare
+farebbe
+farebbero
+farei
+faremmo
+faremo
+fareste
+faresti
+farete
+farà
+farò
+fatto
+favore
+fece
+fecero
+feci
+fin
+finalmente
+finche
+fine
+fino
+forse
+forza
+fosse
+fossero
+fossi
+fossimo
+foste
+fosti
+fra
+frattempo
+fu
+fui
+fummo
+fuori
+furono
+futuro
+generale
+gente
+gia
+giacche
+giorni
+giorno
+giu
+già
+gli
+gliela
+gliele
+glieli
+glielo
+gliene
+grande
+grazie
+gruppo
+ha
+haha
+hai
+hanno
+ho
+i
+ie
+ieri
+il
+improvviso
+in
+inc
+indietro
+infatti
+inoltre
+insieme
+intanto
+intorno
+invece
+io
+l
+la
+lasciato
+lato
+le
+lei
+li
+lo
+lontano
+loro
+lui
+lungo
+luogo
+là
+ma
+macche
+magari
+maggior
+mai
+male
+malgrado
+malissimo
+me
+medesimo
+mediante
+meglio
+meno
+mentre
+mesi
+mezzo
+mi
+mia
+mie
+miei
+mila
+miliardi
+milioni
+minimi
+mio
+modo
+molta
+molti
+moltissimo
+molto
+momento
+mondo
+ne
+negl
+negli
+nei
+nel
+nell
+nella
+nelle
+nello
+nemmeno
+neppure
+nessun
+nessuna
+nessuno
+niente
+no
+noi
+nome
+non
+nondimeno
+nonostante
+nonsia
+nostra
+nostre
+nostri
+nostro
+novanta
+nove
+nulla
+nuovi
+nuovo
+o
+od
+oggi
+ogni
+ognuna
+ognuno
+oltre
+oppure
+ora
+ore
+osi
+ossia
+ottanta
+otto
+paese
+parecchi
+parecchie
+parecchio
+parte
+partendo
+peccato
+peggio
+per
+perche
+perchè
+perché
+percio
+perciò
+perfino
+pero
+persino
+persone
+però
+piedi
+pieno
+piglia
+piu
+piuttosto
+più
+po
+pochissimo
+poco
+poi
+poiche
+possa
+possedere
+posteriore
+posto
+potrebbe
+preferibilmente
+presa
+press
+prima
+primo
+principalmente
+probabilmente
+promesso
+proprio
+puo
+pure
+purtroppo
+può
+qua
+qualche
+qualcosa
+qualcuna
+qualcuno
+quale
+quali
+qualunque
+quando
+quanta
+quante
+quanti
+quanto
+quantunque
+quarto
+quasi
+quattro
+quel
+quella
+quelle
+quelli
+quello
+quest
+questa
+queste
+questi
+questo
+qui
+quindi
+quinto
+realmente
+recente
+recentemente
+registrazione
+relativo
+riecco
+rispetto
+salvo
+sara
+sarai
+saranno
+sarebbe
+sarebbero
+sarei
+saremmo
+saremo
+sareste
+saresti
+sarete
+sarà
+sarò
+scola
+scopo
+scorso
+se
+secondo
+seguente
+seguito
+sei
+sembra
+sembrare
+sembrato
+sembrava
+sembri
+sempre
+senza
+sette
+si
+sia
+siamo
+siano
+siate
+siete
+sig
+solito
+solo
+soltanto
+sono
+sopra
+soprattutto
+sotto
+spesso
+sta
+stai
+stando
+stanno
+starai
+staranno
+starebbe
+starebbero
+starei
+staremmo
+staremo
+stareste
+staresti
+starete
+starà
+starò
+stata
+state
+stati
+stato
+stava
+stavamo
+stavano
+stavate
+stavi
+stavo
+stemmo
+stessa
+stesse
+stessero
+stessi
+stessimo
+stesso
+steste
+stesti
+stette
+stettero
+stetti
+stia
+stiamo
+stiano
+stiate
+sto
+su
+sua
+subito
+successivamente
+successivo
+sue
+sugl
+sugli
+sui
+sul
+sull
+sulla
+sulle
+sullo
+suo
+suoi
+tale
+tali
+talvolta
+tanto
+te
+tempo
+terzo
+th
+ti
+titolo
+tra
+tranne
+tre
+trenta
+triplo
+troppo
+trovato
+tu
+tua
+tue
+tuo
+tuoi
+tutta
+tuttavia
+tutte
+tutti
+tutto
+uguali
+ulteriore
+ultimo
+un
+una
+uno
+uomo
+va
+vai
+vale
+vari
+varia
+varie
+vario
+verso
+vi
+vicino
+visto
+vita
+voi
+volta
+volte
+vostra
+vostre
+vostri
+vostro
+è
\ No newline at end of file
--- /dev/null
+あそこ
+あっ
+あの
+あのかた
+あの人
+あり
+あります
+ある
+あれ
+い
+いう
+います
+いる
+う
+うち
+え
+お
+および
+おり
+おります
+か
+かつて
+から
+が
+き
+ここ
+こちら
+こと
+この
+これ
+これら
+さ
+さらに
+し
+しかし
+する
+ず
+せ
+せる
+そこ
+そして
+その
+その他
+その後
+それ
+それぞれ
+それで
+た
+ただし
+たち
+ため
+たり
+だ
+だっ
+だれ
+つ
+て
+で
+でき
+できる
+です
+では
+でも
+と
+という
+といった
+とき
+ところ
+として
+とともに
+とも
+と共に
+どこ
+どの
+な
+ない
+なお
+なかっ
+ながら
+なく
+なっ
+など
+なに
+なら
+なり
+なる
+なん
+に
+において
+における
+について
+にて
+によって
+により
+による
+に対して
+に対する
+に関する
+の
+ので
+のみ
+は
+ば
+へ
+ほか
+ほとんど
+ほど
+ます
+また
+または
+まで
+も
+もの
+ものの
+や
+よう
+より
+ら
+られ
+られる
+れ
+れる
+を
+ん
+何
+及び
+彼
+彼女
+我々
+特に
+私
+私達
+貴方
+貴方方
\ No newline at end of file
--- /dev/null
+!
+"
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+;
+<
+=
+>
+?
+@
+\
+^
+_
+`
+|
+~
+·
+—
+——
+‘
+’
+“
+”
+…
+、
+。
+〈
+〉
+《
+》
+가
+가까스로
+가령
+각
+각각
+각자
+각종
+갖고말하자면
+같다
+같이
+개의치않고
+거니와
+거바
+거의
+것
+것과 같이
+것들
+게다가
+게우다
+겨우
+견지에서
+결과에 이르다
+결국
+결론을 낼 수 있다
+겸사겸사
+고려하면
+고로
+곧
+공동으로
+과
+과연
+관계가 있다
+관계없이
+관련이 있다
+관하여
+관한
+관해서는
+구
+구체적으로
+구토하다
+그
+그들
+그때
+그래
+그래도
+그래서
+그러나
+그러니
+그러니까
+그러면
+그러므로
+그러한즉
+그런 까닭에
+그런데
+그런즉
+그럼
+그럼에도 불구하고
+그렇게 함으로써
+그렇지
+그렇지 않다면
+그렇지 않으면
+그렇지만
+그렇지않으면
+그리고
+그리하여
+그만이다
+그에 따르는
+그위에
+그저
+그중에서
+그치지 않다
+근거로
+근거하여
+기대여
+기점으로
+기준으로
+기타
+까닭으로
+까악
+까지
+까지 미치다
+까지도
+꽈당
+끙끙
+끼익
+나
+나머지는
+남들
+남짓
+너
+너희
+너희들
+네
+넷
+년
+논하지 않다
+놀라다
+누가 알겠는가
+누구
+다른
+다른 방면으로
+다만
+다섯
+다소
+다수
+다시 말하자면
+다시말하면
+다음
+다음에
+다음으로
+단지
+답다
+당신
+당장
+대로 하다
+대하면
+대하여
+대해 말하자면
+대해서
+댕그
+더구나
+더군다나
+더라도
+더불어
+더욱더
+더욱이는
+도달하다
+도착하다
+동시에
+동안
+된바에야
+된이상
+두번째로
+둘
+둥둥
+뒤따라
+뒤이어
+든간에
+들
+등
+등등
+딩동
+따라
+따라서
+따위
+따지지 않다
+딱
+때
+때가 되어
+때문에
+또
+또한
+뚝뚝
+라 해도
+령
+로
+로 인하여
+로부터
+로써
+륙
+를
+마음대로
+마저
+마저도
+마치
+막론하고
+만 못하다
+만약
+만약에
+만은 아니다
+만이 아니다
+만일
+만큼
+말하자면
+말할것도 없고
+매
+매번
+메쓰겁다
+몇
+모
+모두
+무렵
+무릎쓰고
+무슨
+무엇
+무엇때문에
+물론
+및
+바꾸어말하면
+바꾸어말하자면
+바꾸어서 말하면
+바꾸어서 한다면
+바꿔 말하면
+바로
+바와같이
+밖에 안된다
+반대로
+반대로 말하자면
+반드시
+버금
+보는데서
+보다더
+보드득
+본대로
+봐
+봐라
+부류의 사람들
+부터
+불구하고
+불문하고
+붕붕
+비걱거리다
+비교적
+비길수 없다
+비로소
+비록
+비슷하다
+비추어 보아
+비하면
+뿐만 아니라
+뿐만아니라
+뿐이다
+삐걱
+삐걱거리다
+사
+삼
+상대적으로 말하자면
+생각한대로
+설령
+설마
+설사
+셋
+소생
+소인
+솨
+쉿
+습니까
+습니다
+시각
+시간
+시작하여
+시초에
+시키다
+실로
+심지어
+아
+아니
+아니나다를가
+아니라면
+아니면
+아니었다면
+아래윗
+아무거나
+아무도
+아야
+아울러
+아이
+아이고
+아이구
+아이야
+아이쿠
+아하
+아홉
+안 그러면
+않기 위하여
+않기 위해서
+알 수 있다
+알았어
+앗
+앞에서
+앞의것
+야
+약간
+양자
+어
+어기여차
+어느
+어느 년도
+어느것
+어느곳
+어느때
+어느쪽
+어느해
+어디
+어때
+어떠한
+어떤
+어떤것
+어떤것들
+어떻게
+어떻해
+어이
+어째서
+어쨋든
+어쩔수 없다
+어찌
+어찌됏든
+어찌됏어
+어찌하든지
+어찌하여
+언제
+언젠가
+얼마
+얼마 안 되는 것
+얼마간
+얼마나
+얼마든지
+얼마만큼
+얼마큼
+엉엉
+에
+에 가서
+에 달려 있다
+에 대해
+에 있다
+에 한하다
+에게
+에서
+여
+여기
+여덟
+여러분
+여보시오
+여부
+여섯
+여전히
+여차
+연관되다
+연이서
+영
+영차
+옆사람
+예
+예를 들면
+예를 들자면
+예컨대
+예하면
+오
+오로지
+오르다
+오자마자
+오직
+오호
+오히려
+와
+와 같은 사람들
+와르르
+와아
+왜
+왜냐하면
+외에도
+요만큼
+요만한 것
+요만한걸
+요컨대
+우르르
+우리
+우리들
+우선
+우에 종합한것과같이
+운운
+월
+위에서 서술한바와같이
+위하여
+위해서
+윙윙
+육
+으로
+으로 인하여
+으로서
+으로써
+을
+응
+응당
+의
+의거하여
+의지하여
+의해
+의해되다
+의해서
+이
+이 되다
+이 때문에
+이 밖에
+이 외에
+이 정도의
+이것
+이곳
+이때
+이라면
+이래
+이러이러하다
+이러한
+이런
+이럴정도로
+이렇게 많은 것
+이렇게되면
+이렇게말하자면
+이렇구나
+이로 인하여
+이르기까지
+이리하여
+이만큼
+이번
+이봐
+이상
+이어서
+이었다
+이와 같다
+이와 같은
+이와 반대로
+이와같다면
+이외에도
+이용하여
+이유만으로
+이젠
+이지만
+이쪽
+이천구
+이천육
+이천칠
+이천팔
+인 듯하다
+인젠
+일
+일것이다
+일곱
+일단
+일때
+일반적으로
+일지라도
+임에 틀림없다
+입각하여
+입장에서
+잇따라
+있다
+자
+자기
+자기집
+자마자
+자신
+잠깐
+잠시
+저
+저것
+저것만큼
+저기
+저쪽
+저희
+전부
+전자
+전후
+점에서 보아
+정도에 이르다
+제
+제각기
+제외하고
+조금
+조차
+조차도
+졸졸
+좀
+좋아
+좍좍
+주룩주룩
+주저하지 않고
+줄은 몰랏다
+줄은모른다
+중에서
+중의하나
+즈음하여
+즉
+즉시
+지든지
+지만
+지말고
+진짜로
+쪽으로
+차라리
+참
+참나
+첫번째로
+쳇
+총적으로
+총적으로 말하면
+총적으로 보면
+칠
+콸콸
+쾅쾅
+쿵
+타다
+타인
+탕탕
+토하다
+통하여
+툭
+퉤
+틈타
+팍
+팔
+퍽
+펄렁
+하
+하게될것이다
+하게하다
+하겠는가
+하고 있다
+하고있었다
+하곤하였다
+하구나
+하기 때문에
+하기 위하여
+하기는한데
+하기만 하면
+하기보다는
+하기에
+하나
+하느니
+하는 김에
+하는 편이 낫다
+하는것도
+하는것만 못하다
+하는것이 낫다
+하는바
+하더라도
+하도다
+하도록시키다
+하도록하다
+하든지
+하려고하다
+하마터면
+하면 할수록
+하면된다
+하면서
+하물며
+하여금
+하여야
+하자마자
+하지 않는다면
+하지 않도록
+하지마
+하지마라
+하지만
+하하
+한 까닭에
+한 이유는
+한 후
+한다면
+한다면 몰라도
+한데
+한마디
+한적이있다
+한켠으로는
+한항목
+할 따름이다
+할 생각이다
+할 줄 안다
+할 지경이다
+할 힘이 있다
+할때
+할만하다
+할망정
+할뿐
+할수있다
+할수있어
+할줄알다
+할지라도
+할지언정
+함께
+해도된다
+해도좋다
+해봐요
+해서는 안된다
+해야한다
+해요
+했어요
+향하다
+향하여
+향해서
+허
+허걱
+허허
+헉
+헉헉
+헐떡헐떡
+형식으로 쓰여
+혹시
+혹은
+혼자
+훨씬
+휘익
+휴
+흐흐
+흥
+힘입어
+︿
+!
+#
+$
+%
+&
+(
+)
+*
++
+,
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+>
+?
+@
+[
+]
+{
+|
+}
+~
+¥
\ No newline at end of file
--- /dev/null
+ئێمە
+ئێوە
+ئەم
+ئەو
+ئەوان
+ئەوەی
+بۆ
+بێ
+بێجگە
+بە
+بەبێ
+بەدەم
+بەردەم
+بەرلە
+بەرەوی
+بەرەوە
+بەلای
+بەپێی
+تۆ
+تێ
+جگە
+دوای
+دوو
+دە
+دەکات
+دەگەڵ
+سەر
+لێ
+لە
+لەبابەت
+لەباتی
+لەبارەی
+لەبرێتی
+لەبن
+لەبەر
+لەبەینی
+لەدەم
+لەرێ
+لەرێگا
+لەرەوی
+لەسەر
+لەلایەن
+لەناو
+لەنێو
+لەو
+لەپێناوی
+لەژێر
+لەگەڵ
+من
+ناو
+نێوان
+هەر
+هەروەها
+و
+وەک
+پاش
+پێ
+پێش
+چەند
+کرد
+کە
+ی
\ No newline at end of file
--- /dev/null
+a
+ab
+ac
+ad
+at
+atque
+aut
+autem
+cum
+de
+dum
+e
+erant
+erat
+est
+et
+etiam
+ex
+haec
+hic
+hoc
+in
+ita
+me
+nec
+neque
+non
+per
+qua
+quae
+quam
+qui
+quibus
+quidem
+quo
+quod
+re
+rebus
+rem
+res
+sed
+si
+sic
+sunt
+tamen
+tandem
+te
+ut
+vel
\ No newline at end of file
--- /dev/null
+abi
+abidvi
+abiejose
+abiejuose
+abiejø
+abiem
+abigaliai
+abipus
+abu
+abudu
+ai
+ana
+anaiptol
+anaisiais
+anajai
+anajam
+anajame
+anapus
+anas
+anasai
+anasis
+anei
+aniedvi
+anieji
+aniesiems
+anoji
+anojo
+anojoje
+anokia
+anoks
+anosiomis
+anosioms
+anosios
+anosiose
+anot
+ant
+antai
+anuodu
+anuoju
+anuosiuose
+anuosius
+anàja
+anàjà
+anàjá
+anàsias
+anøjø
+apie
+aplink
+ar
+arba
+argi
+arti
+aukðèiau
+að
+be
+bei
+beje
+bemaþ
+bent
+bet
+betgi
+beveik
+dar
+dargi
+daugmaþ
+deja
+dëka
+dël
+dëlei
+dëlto
+ech
+et
+gal
+galbût
+galgi
+gan
+gana
+gi
+greta
+idant
+iki
+ir
+irgi
+it
+itin
+ið
+iðilgai
+iðvis
+jaisiais
+jajai
+jajam
+jajame
+jei
+jeigu
+ji
+jiedu
+jiedvi
+jieji
+jiesiems
+jinai
+jis
+jisai
+jog
+joji
+jojo
+jojoje
+jokia
+joks
+josiomis
+josioms
+josios
+josiose
+judu
+judvi
+juk
+jumis
+jums
+jumyse
+juodu
+juoju
+juosiuose
+juosius
+jus
+jàja
+jàjà
+jàsias
+jájá
+jøjø
+jûs
+jûsiðkis
+jûsiðkë
+jûsø
+kad
+kada
+kadangi
+kai
+kaip
+kaipgi
+kas
+katra
+katras
+katriedvi
+katruodu
+kaþin
+kaþkas
+kaþkatra
+kaþkatras
+kaþkokia
+kaþkoks
+kaþkuri
+kaþkuris
+kiaurai
+kiek
+kiekvienas
+kieno
+kita
+kitas
+kitokia
+kitoks
+kodël
+kokia
+koks
+kol
+kolei
+kone
+kuomet
+kur
+kurgi
+kuri
+kuriedvi
+kuris
+kuriuodu
+lai
+lig
+ligi
+link
+lyg
+man
+manaisiais
+manajai
+manajam
+manajame
+manas
+manasai
+manasis
+mane
+manieji
+maniesiems
+manim
+manimi
+maniðkis
+maniðkë
+mano
+manoji
+manojo
+manojoje
+manosiomis
+manosioms
+manosios
+manosiose
+manuoju
+manuosiuose
+manuosius
+manyje
+manàja
+manàjà
+manàjá
+manàsias
+manæs
+manøjø
+mat
+maþdaug
+maþne
+mes
+mudu
+mudvi
+mumis
+mums
+mumyse
+mus
+mûsiðkis
+mûsiðkë
+mûsø
+na
+nagi
+ne
+nebe
+nebent
+negi
+negu
+nei
+nejau
+nejaugi
+nekaip
+nelyginant
+nes
+net
+netgi
+netoli
+neva
+nors
+nuo
+në
+o
+ogi
+oi
+paeiliui
+pagal
+pakeliui
+palaipsniui
+palei
+pas
+pasak
+paskos
+paskui
+paskum
+pat
+pati
+patiems
+paties
+pats
+patys
+patá
+paèiais
+paèiam
+paèiame
+paèiu
+paèiuose
+paèius
+paèiø
+per
+pernelyg
+pirm
+pirma
+pirmiau
+po
+prie
+prieð
+prieðais
+pro
+pusiau
+rasi
+rodos
+sau
+savaisiais
+savajai
+savajam
+savajame
+savas
+savasai
+savasis
+save
+savieji
+saviesiems
+savimi
+saviðkis
+saviðkë
+savo
+savoji
+savojo
+savojoje
+savosiomis
+savosioms
+savosios
+savosiose
+savuoju
+savuosiuose
+savuosius
+savyje
+savàja
+savàjà
+savàjá
+savàsias
+savæs
+savøjø
+skersai
+skradþiai
+staèiai
+su
+sulig
+ta
+tad
+tai
+taigi
+taip
+taipogi
+taisiais
+tajai
+tajam
+tajame
+tamsta
+tarp
+tarsi
+tartum
+tarytum
+tas
+tasai
+tau
+tavaisiais
+tavajai
+tavajam
+tavajame
+tavas
+tavasai
+tavasis
+tave
+tavieji
+taviesiems
+tavimi
+taviðkis
+taviðkë
+tavo
+tavoji
+tavojo
+tavojoje
+tavosiomis
+tavosioms
+tavosios
+tavosiose
+tavuoju
+tavuosiuose
+tavuosius
+tavyje
+tavàja
+tavàjà
+tavàjá
+tavàsias
+tavæs
+tavøjø
+taèiau
+te
+tegu
+tegul
+tiedvi
+tieji
+ties
+tiesiems
+tiesiog
+tik
+tikriausiai
+tiktai
+toji
+tojo
+tojoje
+tokia
+toks
+tol
+tolei
+toliau
+tosiomis
+tosioms
+tosios
+tosiose
+tu
+tuodu
+tuoju
+tuosiuose
+tuosius
+turbût
+tàja
+tàjà
+tàjá
+tàsias
+tøjø
+tûlas
+uþ
+uþtat
+uþvis
+va
+vai
+viduj
+vidury
+vien
+vienas
+vienokia
+vienoks
+vietoj
+virð
+virðuj
+virðum
+vis
+vis dëlto
+visa
+visas
+visgi
+visokia
+visoks
+vos
+vël
+vëlgi
+ypaè
+á
+ákypai
+ástriþai
+ðalia
+ðe
+ði
+ðiaisiais
+ðiajai
+ðiajam
+ðiajame
+ðiapus
+ðiedvi
+ðieji
+ðiesiems
+ðioji
+ðiojo
+ðiojoje
+ðiokia
+ðioks
+ðiosiomis
+ðiosioms
+ðiosios
+ðiosiose
+ðis
+ðisai
+ðit
+ðita
+ðitas
+ðitiedvi
+ðitokia
+ðitoks
+ðituodu
+ðiuodu
+ðiuoju
+ðiuosiuose
+ðiuosius
+ðiàja
+ðiàjà
+ðiàsias
+ðiøjø
+ðtai
+ðájá
+þemiau
\ No newline at end of file
--- /dev/null
+aiz
+ap
+apakš
+apakšpus
+ar
+arī
+augšpus
+bet
+bez
+bija
+biji
+biju
+bijām
+bijāt
+būs
+būsi
+būsiet
+būsim
+būt
+būšu
+caur
+diemžēl
+diezin
+droši
+dēļ
+esam
+esat
+esi
+esmu
+gan
+gar
+iekam
+iekams
+iekām
+iekāms
+iekš
+iekšpus
+ik
+ir
+it
+itin
+iz
+ja
+jau
+jeb
+jebšu
+jel
+jo
+jā
+ka
+kamēr
+kaut
+kolīdz
+kopš
+kā
+kļuva
+kļuvi
+kļuvu
+kļuvām
+kļuvāt
+kļūs
+kļūsi
+kļūsiet
+kļūsim
+kļūst
+kļūstam
+kļūstat
+kļūsti
+kļūstu
+kļūt
+kļūšu
+labad
+lai
+lejpus
+līdz
+līdzko
+ne
+nebūt
+nedz
+nekā
+nevis
+nezin
+no
+nu
+nē
+otrpus
+pa
+par
+pat
+pie
+pirms
+pret
+priekš
+pār
+pēc
+starp
+tad
+tak
+tapi
+taps
+tapsi
+tapsiet
+tapsim
+tapt
+tapāt
+tapšu
+taču
+te
+tiec
+tiek
+tiekam
+tiekat
+tieku
+tik
+tika
+tikai
+tiki
+tikko
+tiklab
+tiklīdz
+tiks
+tiksiet
+tiksim
+tikt
+tiku
+tikvien
+tikām
+tikāt
+tikšu
+tomēr
+topat
+turpretim
+turpretī
+tā
+tādēļ
+tālab
+tāpēc
+un
+uz
+vai
+var
+varat
+varēja
+varēji
+varēju
+varējām
+varējāt
+varēs
+varēsi
+varēsiet
+varēsim
+varēt
+varēšu
+vien
+virs
+virspus
+vis
+viņpus
+zem
+ārpus
+šaipus
\ No newline at end of file
--- /dev/null
+अधिक
+अनेक
+अशी
+असलयाचे
+असलेल्या
+असा
+असून
+असे
+आज
+आणि
+आता
+आपल्या
+आला
+आली
+आले
+आहे
+आहेत
+एक
+एका
+कमी
+करणयात
+करून
+का
+काम
+काय
+काही
+किवा
+की
+केला
+केली
+केले
+कोटी
+गेल्या
+घेऊन
+जात
+झाला
+झाली
+झाले
+झालेल्या
+टा
+डॉ
+तर
+तरी
+तसेच
+ता
+ती
+तीन
+ते
+तो
+त्या
+त्याचा
+त्याची
+त्याच्या
+त्याना
+त्यानी
+त्यामुळे
+त्री
+दिली
+दोन
+न
+नाही
+निर्ण्य
+पण
+पम
+परयतन
+पाटील
+म
+मात्र
+माहिती
+मी
+मुबी
+म्हणजे
+म्हणाले
+म्हणून
+या
+याचा
+याची
+याच्या
+याना
+यानी
+येणार
+येत
+येथील
+येथे
+लाख
+व
+व्यकत
+सर्व
+सागित्ले
+सुरू
+हजार
+हा
+ही
+हे
+होणार
+होत
+होता
+होती
+होते
\ No newline at end of file
--- /dev/null
+abdul
+abdullah
+acara
+ada
+adalah
+ahmad
+air
+akan
+akhbar
+akhir
+aktiviti
+alam
+amat
+amerika
+anak
+anggota
+antara
+antarabangsa
+apa
+apabila
+april
+as
+asas
+asean
+asia
+asing
+atas
+atau
+australia
+awal
+awam
+bagaimanapun
+bagi
+bahagian
+bahan
+baharu
+bahawa
+baik
+bandar
+bank
+banyak
+barangan
+baru
+baru-baru
+bawah
+beberapa
+bekas
+beliau
+belum
+berada
+berakhir
+berbanding
+berdasarkan
+berharap
+berikutan
+berjaya
+berjumlah
+berkaitan
+berkata
+berkenaan
+berlaku
+bermula
+bernama
+bernilai
+bersama
+berubah
+besar
+bhd
+bidang
+bilion
+bn
+boleh
+bukan
+bulan
+bursa
+cadangan
+china
+dagangan
+dalam
+dan
+dana
+dapat
+dari
+daripada
+dasar
+datang
+datuk
+demikian
+dengan
+depan
+derivatives
+dewan
+di
+diadakan
+dibuka
+dicatatkan
+dijangka
+diniagakan
+dis
+disember
+ditutup
+dolar
+dr
+dua
+dunia
+ekonomi
+eksekutif
+eksport
+empat
+enam
+faedah
+feb
+global
+hadapan
+hanya
+harga
+hari
+hasil
+hingga
+hubungan
+ia
+iaitu
+ialah
+indeks
+india
+indonesia
+industri
+ini
+islam
+isnin
+isu
+itu
+jabatan
+jalan
+jan
+jawatan
+jawatankuasa
+jepun
+jika
+jualan
+juga
+julai
+jumaat
+jumlah
+jun
+juta
+kadar
+kalangan
+kali
+kami
+kata
+katanya
+kaunter
+kawasan
+ke
+keadaan
+kecil
+kedua
+kedua-dua
+kedudukan
+kekal
+kementerian
+kemudahan
+kenaikan
+kenyataan
+kepada
+kepentingan
+keputusan
+kerajaan
+kerana
+kereta
+kerja
+kerjasama
+kes
+keselamatan
+keseluruhan
+kesihatan
+ketika
+ketua
+keuntungan
+kewangan
+khamis
+kini
+kira-kira
+kita
+klci
+klibor
+komposit
+kontrak
+kos
+kuala
+kuasa
+kukuh
+kumpulan
+lagi
+lain
+langkah
+laporan
+lebih
+lepas
+lima
+lot
+luar
+lumpur
+mac
+mahkamah
+mahu
+majlis
+makanan
+maklumat
+malam
+malaysia
+mana
+manakala
+masa
+masalah
+masih
+masing-masing
+masyarakat
+mata
+media
+mei
+melalui
+melihat
+memandangkan
+memastikan
+membantu
+membawa
+memberi
+memberikan
+membolehkan
+membuat
+mempunyai
+menambah
+menarik
+menawarkan
+mencapai
+mencatatkan
+mendapat
+mendapatkan
+menerima
+menerusi
+mengadakan
+mengambil
+mengenai
+menggalakkan
+menggunakan
+mengikut
+mengumumkan
+mengurangkan
+meningkat
+meningkatkan
+menjadi
+menjelang
+menokok
+menteri
+menunjukkan
+menurut
+menyaksikan
+menyediakan
+mereka
+merosot
+merupakan
+mesyuarat
+minat
+minggu
+minyak
+modal
+mohd
+mudah
+mungkin
+naik
+najib
+nasional
+negara
+negara-negara
+negeri
+niaga
+nilai
+nov
+ogos
+okt
+oleh
+operasi
+orang
+pada
+pagi
+paling
+pameran
+papan
+para
+paras
+parlimen
+parti
+pasaran
+pasukan
+pegawai
+pejabat
+pekerja
+pelabur
+pelaburan
+pelancongan
+pelanggan
+pelbagai
+peluang
+pembangunan
+pemberita
+pembinaan
+pemimpin
+pendapatan
+pendidikan
+penduduk
+penerbangan
+pengarah
+pengeluaran
+pengerusi
+pengguna
+pengurusan
+peniaga
+peningkatan
+penting
+peratus
+perdagangan
+perdana
+peringkat
+perjanjian
+perkara
+perkhidmatan
+perladangan
+perlu
+permintaan
+perniagaan
+persekutuan
+persidangan
+pertama
+pertubuhan
+pertumbuhan
+perusahaan
+peserta
+petang
+pihak
+pilihan
+pinjaman
+polis
+politik
+presiden
+prestasi
+produk
+program
+projek
+proses
+proton
+pukul
+pula
+pusat
+rabu
+rakan
+rakyat
+ramai
+rantau
+raya
+rendah
+ringgit
+rumah
+sabah
+sahaja
+saham
+sama
+sarawak
+satu
+sawit
+saya
+sdn
+sebagai
+sebahagian
+sebanyak
+sebarang
+sebelum
+sebelumnya
+sebuah
+secara
+sedang
+segi
+sehingga
+sejak
+sekarang
+sektor
+sekuriti
+selain
+selama
+selasa
+selatan
+selepas
+seluruh
+semakin
+semalam
+semasa
+sementara
+semua
+semula
+sen
+sendiri
+seorang
+sepanjang
+seperti
+sept
+september
+serantau
+seri
+serta
+sesi
+setiap
+setiausaha
+sidang
+singapura
+sini
+sistem
+sokongan
+sri
+sudah
+sukan
+suku
+sumber
+supaya
+susut
+syarikat
+syed
+tahap
+tahun
+tan
+tanah
+tanpa
+tawaran
+teknologi
+telah
+tempat
+tempatan
+tempoh
+tenaga
+tengah
+tentang
+terbaik
+terbang
+terbesar
+terbuka
+terdapat
+terhadap
+termasuk
+tersebut
+terus
+tetapi
+thailand
+tiada
+tidak
+tiga
+timbalan
+timur
+tindakan
+tinggi
+tun
+tunai
+turun
+turut
+umno
+unit
+untuk
+untung
+urus
+usaha
+utama
+walaupun
+wang
+wanita
+wilayah
+yang
\ No newline at end of file
--- /dev/null
+aan
+aangaande
+aangezien
+achte
+achter
+achterna
+af
+afgelopen
+al
+aldaar
+aldus
+alhoewel
+alias
+alle
+allebei
+alleen
+alles
+als
+alsnog
+altijd
+altoos
+ander
+andere
+anders
+anderszins
+beetje
+behalve
+behoudens
+beide
+beiden
+ben
+beneden
+bent
+bepaald
+betreffende
+bij
+bijna
+bijv
+binnen
+binnenin
+blijkbaar
+blijken
+boven
+bovenal
+bovendien
+bovengenoemd
+bovenstaand
+bovenvermeld
+buiten
+bv
+daar
+daardoor
+daarheen
+daarin
+daarna
+daarnet
+daarom
+daarop
+daaruit
+daarvanlangs
+dan
+dat
+de
+deden
+deed
+der
+derde
+derhalve
+dertig
+deze
+dhr
+die
+dikwijls
+dit
+doch
+doe
+doen
+doet
+door
+doorgaand
+drie
+duizend
+dus
+echter
+een
+eens
+eer
+eerdat
+eerder
+eerlang
+eerst
+eerste
+eigen
+eigenlijk
+elk
+elke
+en
+enig
+enige
+enigszins
+enkel
+er
+erdoor
+erg
+ergens
+etc
+etcetera
+even
+eveneens
+evenwel
+gauw
+ge
+gedurende
+geen
+gehad
+gekund
+geleden
+gelijk
+gemoeten
+gemogen
+genoeg
+geweest
+gewoon
+gewoonweg
+haar
+haarzelf
+had
+hadden
+hare
+heb
+hebben
+hebt
+hedden
+heeft
+heel
+hem
+hemzelf
+hen
+het
+hetzelfde
+hier
+hierbeneden
+hierboven
+hierin
+hierna
+hierom
+hij
+hijzelf
+hoe
+hoewel
+honderd
+hun
+hunne
+ieder
+iedere
+iedereen
+iemand
+iets
+ik
+ikzelf
+in
+inderdaad
+inmiddels
+intussen
+inzake
+is
+ja
+je
+jezelf
+jij
+jijzelf
+jou
+jouw
+jouwe
+juist
+jullie
+kan
+klaar
+kon
+konden
+krachtens
+kun
+kunnen
+kunt
+laatst
+later
+liever
+lijken
+lijkt
+maak
+maakt
+maakte
+maakten
+maar
+mag
+maken
+me
+meer
+meest
+meestal
+men
+met
+mevr
+mezelf
+mij
+mijn
+mijnent
+mijner
+mijzelf
+minder
+miss
+misschien
+missen
+mits
+mocht
+mochten
+moest
+moesten
+moet
+moeten
+mogen
+mr
+mrs
+mw
+na
+naar
+nadat
+nam
+namelijk
+nee
+neem
+negen
+nemen
+nergens
+net
+niemand
+niet
+niets
+niks
+noch
+nochtans
+nog
+nogal
+nooit
+nu
+nv
+of
+ofschoon
+om
+omdat
+omhoog
+omlaag
+omstreeks
+omtrent
+omver
+ondanks
+onder
+ondertussen
+ongeveer
+ons
+onszelf
+onze
+onzeker
+ooit
+ook
+op
+opnieuw
+opzij
+over
+overal
+overeind
+overige
+overigens
+paar
+pas
+per
+precies
+recent
+redelijk
+reeds
+rond
+rondom
+samen
+sedert
+sinds
+sindsdien
+slechts
+sommige
+spoedig
+steeds
+tamelijk
+te
+tegen
+tegenover
+tenzij
+terwijl
+thans
+tien
+tiende
+tijdens
+tja
+toch
+toe
+toen
+toenmaals
+toenmalig
+tot
+totdat
+tussen
+twee
+tweede
+u
+uit
+uitgezonderd
+uw
+vaak
+vaakwat
+van
+vanaf
+vandaan
+vanuit
+vanwege
+veel
+veeleer
+veertig
+verder
+verscheidene
+verschillende
+vervolgens
+via
+vier
+vierde
+vijf
+vijfde
+vijftig
+vol
+volgend
+volgens
+voor
+vooraf
+vooral
+vooralsnog
+voorbij
+voordat
+voordezen
+voordien
+voorheen
+voorop
+voorts
+vooruit
+vrij
+vroeg
+waar
+waarom
+waarschijnlijk
+wanneer
+want
+waren
+was
+wat
+we
+wederom
+weer
+weg
+wegens
+weinig
+wel
+weldra
+welk
+welke
+werd
+werden
+werder
+wezen
+whatever
+wie
+wiens
+wier
+wij
+wijzelf
+wil
+wilden
+willen
+word
+worden
+wordt
+zal
+ze
+zei
+zeker
+zelf
+zelfde
+zelfs
+zes
+zeven
+zich
+zichzelf
+zij
+zijn
+zijne
+zijzelf
+zo
+zoals
+zodat
+zodra
+zonder
+zou
+zouden
+zowat
+zulk
+zulke
+zullen
+zult
\ No newline at end of file
--- /dev/null
+alle
+andre
+arbeid
+at
+av
+bare
+begge
+ble
+blei
+bli
+blir
+blitt
+bort
+bra
+bruke
+både
+båe
+da
+de
+deg
+dei
+deim
+deira
+deires
+dem
+den
+denne
+der
+dere
+deres
+det
+dette
+di
+din
+disse
+ditt
+du
+dykk
+dykkar
+då
+eg
+ein
+eit
+eitt
+eller
+elles
+en
+ene
+eneste
+enhver
+enn
+er
+et
+ett
+etter
+folk
+for
+fordi
+forsûke
+fra
+få
+før
+fûr
+fûrst
+gjorde
+gjûre
+god
+gå
+ha
+hadde
+han
+hans
+har
+hennar
+henne
+hennes
+her
+hjå
+ho
+hoe
+honom
+hoss
+hossen
+hun
+hva
+hvem
+hver
+hvilke
+hvilken
+hvis
+hvor
+hvordan
+hvorfor
+i
+ikke
+ikkje
+ingen
+ingi
+inkje
+inn
+innen
+inni
+ja
+jeg
+kan
+kom
+korleis
+korso
+kun
+kunne
+kva
+kvar
+kvarhelst
+kven
+kvi
+kvifor
+lage
+lang
+lik
+like
+makt
+man
+mange
+me
+med
+medan
+meg
+meget
+mellom
+men
+mens
+mer
+mest
+mi
+min
+mine
+mitt
+mot
+mye
+mykje
+må
+måte
+navn
+ned
+nei
+no
+noe
+noen
+noka
+noko
+nokon
+nokor
+nokre
+ny
+nå
+når
+og
+også
+om
+opp
+oss
+over
+part
+punkt
+på
+rett
+riktig
+samme
+sant
+seg
+selv
+si
+sia
+sidan
+siden
+sin
+sine
+sist
+sitt
+sjøl
+skal
+skulle
+slik
+slutt
+so
+som
+somme
+somt
+start
+stille
+så
+sånn
+tid
+til
+tilbake
+tilstand
+um
+under
+upp
+ut
+uten
+var
+vart
+varte
+ved
+verdi
+vere
+verte
+vi
+vil
+ville
+vite
+vore
+vors
+vort
+vår
+være
+vært
+vöre
+vört
+å
\ No newline at end of file
--- /dev/null
+a
+aby
+ach
+acz
+aczkolwiek
+aj
+albo
+ale
+ależ
+ani
+aż
+bardziej
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bym
+bynajmniej
+być
+był
+była
+było
+były
+będzie
+będą
+cali
+cała
+cały
+chce
+choć
+ci
+ciebie
+cię
+co
+cokolwiek
+coraz
+coś
+czasami
+czasem
+czemu
+czy
+czyli
+często
+daleko
+dla
+dlaczego
+dlatego
+do
+dobrze
+dokąd
+dość
+dr
+dużo
+dwa
+dwaj
+dwie
+dwoje
+dzisiaj
+dziś
+gdy
+gdyby
+gdyż
+gdzie
+gdziekolwiek
+gdzieś
+go
+godz
+hab
+i
+ich
+ii
+iii
+ile
+im
+inna
+inne
+inny
+innych
+inż
+iv
+ix
+iż
+ja
+jak
+jakaś
+jakby
+jaki
+jakichś
+jakie
+jakiś
+jakiż
+jakkolwiek
+jako
+jakoś
+je
+jeden
+jedna
+jednak
+jednakże
+jedno
+jednym
+jedynie
+jego
+jej
+jemu
+jest
+jestem
+jeszcze
+jeśli
+jeżeli
+już
+ją
+każdy
+kiedy
+kierunku
+kilka
+kilku
+kimś
+kto
+ktokolwiek
+ktoś
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+ku
+lat
+lecz
+lub
+ma
+mają
+mam
+mamy
+mało
+mgr
+mi
+miał
+mimo
+między
+mnie
+mną
+mogą
+moi
+moim
+moja
+moje
+może
+możliwe
+można
+mu
+musi
+my
+mój
+na
+nad
+nam
+nami
+nas
+nasi
+nasz
+nasza
+nasze
+naszego
+naszych
+natomiast
+natychmiast
+nawet
+nic
+nich
+nie
+niech
+niego
+niej
+niemu
+nigdy
+nim
+nimi
+nią
+niż
+no
+nowe
+np
+nr
+o
+o.o.
+obok
+od
+ok
+około
+on
+ona
+one
+oni
+ono
+oraz
+oto
+owszem
+pan
+pana
+pani
+pl
+po
+pod
+podczas
+pomimo
+ponad
+ponieważ
+powinien
+powinna
+powinni
+powinno
+poza
+prawie
+prof
+przecież
+przed
+przede
+przedtem
+przez
+przy
+raz
+razie
+roku
+również
+sam
+sama
+się
+skąd
+sobie
+sobą
+sposób
+swoje
+są
+ta
+tak
+taka
+taki
+takich
+takie
+także
+tam
+te
+tego
+tej
+tel
+temu
+ten
+teraz
+też
+to
+tobie
+tobą
+toteż
+totobą
+trzeba
+tu
+tutaj
+twoi
+twoim
+twoja
+twoje
+twym
+twój
+ty
+tych
+tylko
+tym
+tys
+tzw
+tę
+u
+ul
+vi
+vii
+viii
+vol
+w
+wam
+wami
+was
+wasi
+wasz
+wasza
+wasze
+we
+według
+wie
+wiele
+wielu
+więc
+więcej
+wszyscy
+wszystkich
+wszystkie
+wszystkim
+wszystko
+wtedy
+www
+wy
+właśnie
+wśród
+xi
+xii
+xiii
+xiv
+xv
+z
+za
+zapewne
+zawsze
+zaś
+ze
+zeznowu
+znowu
+znów
+został
+zł
+żaden
+żadna
+żadne
+żadnych
+że
+żeby
\ No newline at end of file
--- /dev/null
+a
+acerca
+adeus
+agora
+ainda
+alem
+algmas
+algo
+algumas
+alguns
+ali
+além
+ambas
+ambos
+ano
+anos
+antes
+ao
+aonde
+aos
+apenas
+apoio
+apontar
+apos
+após
+aquela
+aquelas
+aquele
+aqueles
+aqui
+aquilo
+as
+assim
+através
+atrás
+até
+aí
+baixo
+bastante
+bem
+boa
+boas
+bom
+bons
+breve
+cada
+caminho
+catorze
+cedo
+cento
+certamente
+certeza
+cima
+cinco
+coisa
+com
+como
+comprido
+conhecido
+conselho
+contra
+contudo
+corrente
+cuja
+cujas
+cujo
+cujos
+custa
+cá
+da
+daquela
+daquelas
+daquele
+daqueles
+dar
+das
+de
+debaixo
+dela
+delas
+dele
+deles
+demais
+dentro
+depois
+desde
+desligado
+dessa
+dessas
+desse
+desses
+desta
+destas
+deste
+destes
+deve
+devem
+deverá
+dez
+dezanove
+dezasseis
+dezassete
+dezoito
+dia
+diante
+direita
+dispoe
+dispoem
+diversa
+diversas
+diversos
+diz
+dizem
+dizer
+do
+dois
+dos
+doze
+duas
+durante
+dá
+dão
+dúvida
+e
+ela
+elas
+ele
+eles
+em
+embora
+enquanto
+entao
+entre
+então
+era
+eram
+essa
+essas
+esse
+esses
+esta
+estado
+estamos
+estar
+estará
+estas
+estava
+estavam
+este
+esteja
+estejam
+estejamos
+estes
+esteve
+estive
+estivemos
+estiver
+estivera
+estiveram
+estiverem
+estivermos
+estivesse
+estivessem
+estiveste
+estivestes
+estivéramos
+estivéssemos
+estou
+está
+estás
+estávamos
+estão
+eu
+exemplo
+falta
+fará
+favor
+faz
+fazeis
+fazem
+fazemos
+fazer
+fazes
+fazia
+faço
+fez
+fim
+final
+foi
+fomos
+for
+fora
+foram
+forem
+forma
+formos
+fosse
+fossem
+foste
+fostes
+fui
+fôramos
+fôssemos
+geral
+grande
+grandes
+grupo
+ha
+haja
+hajam
+hajamos
+havemos
+havia
+hei
+hoje
+hora
+horas
+houve
+houvemos
+houver
+houvera
+houveram
+houverei
+houverem
+houveremos
+houveria
+houveriam
+houvermos
+houverá
+houverão
+houveríamos
+houvesse
+houvessem
+houvéramos
+houvéssemos
+há
+hão
+iniciar
+inicio
+ir
+irá
+isso
+ista
+iste
+isto
+já
+lado
+lhe
+lhes
+ligado
+local
+logo
+longe
+lugar
+lá
+maior
+maioria
+maiorias
+mais
+mal
+mas
+me
+mediante
+meio
+menor
+menos
+meses
+mesma
+mesmas
+mesmo
+mesmos
+meu
+meus
+mil
+minha
+minhas
+momento
+muito
+muitos
+máximo
+mês
+na
+nada
+nao
+naquela
+naquelas
+naquele
+naqueles
+nas
+nem
+nenhuma
+nessa
+nessas
+nesse
+nesses
+nesta
+nestas
+neste
+nestes
+no
+noite
+nome
+nos
+nossa
+nossas
+nosso
+nossos
+nova
+novas
+nove
+novo
+novos
+num
+numa
+numas
+nunca
+nuns
+não
+nível
+nós
+número
+o
+obra
+obrigada
+obrigado
+oitava
+oitavo
+oito
+onde
+ontem
+onze
+os
+ou
+outra
+outras
+outro
+outros
+para
+parece
+parte
+partir
+paucas
+pegar
+pela
+pelas
+pelo
+pelos
+perante
+perto
+pessoas
+pode
+podem
+poder
+poderá
+podia
+pois
+ponto
+pontos
+por
+porque
+porquê
+portanto
+posição
+possivelmente
+posso
+possível
+pouca
+pouco
+poucos
+povo
+primeira
+primeiras
+primeiro
+primeiros
+promeiro
+propios
+proprio
+própria
+próprias
+próprio
+próprios
+próxima
+próximas
+próximo
+próximos
+puderam
+pôde
+põe
+põem
+quais
+qual
+qualquer
+quando
+quanto
+quarta
+quarto
+quatro
+que
+quem
+quer
+quereis
+querem
+queremas
+queres
+quero
+questão
+quieto
+quinta
+quinto
+quinze
+quáis
+quê
+relação
+sabe
+sabem
+saber
+se
+segunda
+segundo
+sei
+seis
+seja
+sejam
+sejamos
+sem
+sempre
+sendo
+ser
+serei
+seremos
+seria
+seriam
+será
+serão
+seríamos
+sete
+seu
+seus
+sexta
+sexto
+sim
+sistema
+sob
+sobre
+sois
+somente
+somos
+sou
+sua
+suas
+são
+sétima
+sétimo
+só
+tal
+talvez
+tambem
+também
+tanta
+tantas
+tanto
+tarde
+te
+tem
+temos
+tempo
+tendes
+tenha
+tenham
+tenhamos
+tenho
+tens
+tentar
+tentaram
+tente
+tentei
+ter
+terceira
+terceiro
+terei
+teremos
+teria
+teriam
+terá
+terão
+teríamos
+teu
+teus
+teve
+tinha
+tinham
+tipo
+tive
+tivemos
+tiver
+tivera
+tiveram
+tiverem
+tivermos
+tivesse
+tivessem
+tiveste
+tivestes
+tivéramos
+tivéssemos
+toda
+todas
+todo
+todos
+trabalhar
+trabalho
+treze
+três
+tu
+tua
+tuas
+tudo
+tão
+tém
+têm
+tínhamos
+um
+uma
+umas
+uns
+usa
+usar
+vai
+vais
+valor
+veja
+vem
+vens
+ver
+verdade
+verdadeiro
+vez
+vezes
+viagem
+vindo
+vinte
+você
+vocês
+vos
+vossa
+vossas
+vosso
+vossos
+vários
+vão
+vêm
+vós
+zero
+à
+às
+área
+é
+éramos
+és
+último
\ No newline at end of file
--- /dev/null
+a
+abia
+acea
+aceasta
+această
+aceea
+aceeasi
+acei
+aceia
+acel
+acela
+acelasi
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+acestei
+acestia
+acestui
+aceşti
+aceştia
+acolo
+acord
+acum
+adica
+ai
+aia
+aibă
+aici
+aiurea
+al
+ala
+alaturi
+ale
+alea
+alt
+alta
+altceva
+altcineva
+alte
+altfel
+alti
+altii
+altul
+am
+anume
+apoi
+ar
+are
+as
+asa
+asemenea
+asta
+astazi
+astea
+astfel
+astăzi
+asupra
+atare
+atat
+atata
+atatea
+atatia
+ati
+atit
+atita
+atitea
+atitia
+atunci
+au
+avea
+avem
+aveţi
+avut
+azi
+aş
+aşadar
+aţi
+b
+ba
+bine
+bucur
+bună
+c
+ca
+cam
+cand
+capat
+care
+careia
+carora
+caruia
+cat
+catre
+caut
+ce
+cea
+ceea
+cei
+ceilalti
+cel
+cele
+celor
+ceva
+chiar
+ci
+cinci
+cind
+cine
+cineva
+cit
+cita
+cite
+citeva
+citi
+citiva
+conform
+contra
+cu
+cui
+cum
+cumva
+curând
+curînd
+când
+cât
+câte
+câtva
+câţi
+cînd
+cît
+cîte
+cîtva
+cîţi
+că
+căci
+cărei
+căror
+cărui
+către
+d
+da
+daca
+dacă
+dar
+dat
+datorită
+dată
+dau
+de
+deasupra
+deci
+decit
+degraba
+deja
+deoarece
+departe
+desi
+despre
+deşi
+din
+dinaintea
+dintr
+dintr-
+dintre
+doar
+doi
+doilea
+două
+drept
+dupa
+după
+dă
+e
+ea
+ei
+el
+ele
+era
+eram
+este
+eu
+exact
+eşti
+f
+face
+fara
+fata
+fel
+fi
+fie
+fiecare
+fii
+fim
+fiu
+fiţi
+foarte
+fost
+frumos
+fără
+g
+geaba
+graţie
+h
+halbă
+i
+ia
+iar
+ieri
+ii
+il
+imi
+in
+inainte
+inapoi
+inca
+incit
+insa
+intr
+intre
+isi
+iti
+j
+k
+l
+la
+le
+li
+lor
+lui
+lângă
+lîngă
+m
+ma
+mai
+mare
+mea
+mei
+mele
+mereu
+meu
+mi
+mie
+mine
+mod
+mult
+multa
+multe
+multi
+multă
+mulţi
+mulţumesc
+mâine
+mîine
+mă
+n
+ne
+nevoie
+ni
+nici
+niciodata
+nicăieri
+nimeni
+nimeri
+nimic
+niste
+nişte
+noastre
+noastră
+noi
+noroc
+nostri
+nostru
+nou
+noua
+nouă
+noştri
+nu
+numai
+o
+opt
+or
+ori
+oricare
+orice
+oricine
+oricum
+oricând
+oricât
+oricînd
+oricît
+oriunde
+p
+pai
+parca
+patra
+patru
+patrulea
+pe
+pentru
+peste
+pic
+pina
+plus
+poate
+pot
+prea
+prima
+primul
+prin
+printr-
+putini
+puţin
+puţina
+puţină
+până
+pînă
+r
+rog
+s
+sa
+sa-mi
+sa-ti
+sai
+sale
+sau
+se
+si
+sint
+sintem
+spate
+spre
+sub
+sunt
+suntem
+sunteţi
+sus
+sută
+sînt
+sîntem
+sînteţi
+să
+săi
+său
+t
+ta
+tale
+te
+ti
+timp
+tine
+toata
+toate
+toată
+tocmai
+tot
+toti
+totul
+totusi
+totuşi
+toţi
+trei
+treia
+treilea
+tu
+tuturor
+tăi
+tău
+u
+ul
+ului
+un
+una
+unde
+undeva
+unei
+uneia
+unele
+uneori
+unii
+unor
+unora
+unu
+unui
+unuia
+unul
+v
+va
+vi
+voastre
+voastră
+voi
+vom
+vor
+vostru
+vouă
+voştri
+vreme
+vreo
+vreun
+vă
+x
+z
+zece
+zero
+zi
+zice
+îi
+îl
+îmi
+împotriva
+în
+înainte
+înaintea
+încotro
+încât
+încît
+între
+întrucât
+întrucît
+îţi
+ăla
+ălea
+ăsta
+ăstea
+ăştia
+şapte
+şase
+şi
+ştiu
+ţi
+ţie
\ No newline at end of file
--- /dev/null
+c
+а
+алло
+без
+белый
+близко
+более
+больше
+большой
+будем
+будет
+будете
+будешь
+будто
+буду
+будут
+будь
+бы
+бывает
+бывь
+был
+была
+были
+было
+быть
+в
+важная
+важное
+важные
+важный
+вам
+вами
+вас
+ваш
+ваша
+ваше
+ваши
+вверх
+вдали
+вдруг
+ведь
+везде
+вернуться
+весь
+вечер
+взгляд
+взять
+вид
+видел
+видеть
+вместе
+вне
+вниз
+внизу
+во
+вода
+война
+вокруг
+вон
+вообще
+вопрос
+восемнадцатый
+восемнадцать
+восемь
+восьмой
+вот
+впрочем
+времени
+время
+все
+все еще
+всегда
+всего
+всем
+всеми
+всему
+всех
+всею
+всю
+всюду
+вся
+всё
+второй
+вы
+выйти
+г
+где
+главный
+глаз
+говорил
+говорит
+говорить
+год
+года
+году
+голова
+голос
+город
+да
+давать
+давно
+даже
+далекий
+далеко
+дальше
+даром
+дать
+два
+двадцатый
+двадцать
+две
+двенадцатый
+двенадцать
+дверь
+двух
+девятнадцатый
+девятнадцать
+девятый
+девять
+действительно
+дел
+делал
+делать
+делаю
+дело
+день
+деньги
+десятый
+десять
+для
+до
+довольно
+долго
+должен
+должно
+должный
+дом
+дорога
+друг
+другая
+другие
+других
+друго
+другое
+другой
+думать
+душа
+е
+его
+ее
+ей
+ему
+если
+есть
+еще
+ещё
+ею
+её
+ж
+ждать
+же
+жена
+женщина
+жизнь
+жить
+за
+занят
+занята
+занято
+заняты
+затем
+зато
+зачем
+здесь
+земля
+знать
+значит
+значить
+и
+иди
+идти
+из
+или
+им
+имеет
+имел
+именно
+иметь
+ими
+имя
+иногда
+их
+к
+каждая
+каждое
+каждые
+каждый
+кажется
+казаться
+как
+какая
+какой
+кем
+книга
+когда
+кого
+ком
+комната
+кому
+конец
+конечно
+которая
+которого
+которой
+которые
+который
+которых
+кроме
+кругом
+кто
+куда
+лежать
+лет
+ли
+лицо
+лишь
+лучше
+любить
+люди
+м
+маленький
+мало
+мать
+машина
+между
+меля
+менее
+меньше
+меня
+место
+миллионов
+мимо
+минута
+мир
+мира
+мне
+много
+многочисленная
+многочисленное
+многочисленные
+многочисленный
+мной
+мною
+мог
+могу
+могут
+мож
+может
+может быть
+можно
+можхо
+мои
+мой
+мор
+москва
+мочь
+моя
+моё
+мы
+на
+наверху
+над
+надо
+назад
+наиболее
+найти
+наконец
+нам
+нами
+народ
+нас
+начала
+начать
+наш
+наша
+наше
+наши
+не
+него
+недавно
+недалеко
+нее
+ней
+некоторый
+нельзя
+нем
+немного
+нему
+непрерывно
+нередко
+несколько
+нет
+нею
+неё
+ни
+нибудь
+ниже
+низко
+никакой
+никогда
+никто
+никуда
+ним
+ними
+них
+ничего
+ничто
+но
+новый
+нога
+ночь
+ну
+нужно
+нужный
+нх
+о
+об
+оба
+обычно
+один
+одиннадцатый
+одиннадцать
+однажды
+однако
+одного
+одной
+оказаться
+окно
+около
+он
+она
+они
+оно
+опять
+особенно
+остаться
+от
+ответить
+отец
+откуда
+отовсюду
+отсюда
+очень
+первый
+перед
+писать
+плечо
+по
+под
+подойди
+подумать
+пожалуйста
+позже
+пойти
+пока
+пол
+получить
+помнить
+понимать
+понять
+пор
+пора
+после
+последний
+посмотреть
+посреди
+потом
+потому
+почему
+почти
+правда
+прекрасно
+при
+про
+просто
+против
+процентов
+путь
+пятнадцатый
+пятнадцать
+пятый
+пять
+работа
+работать
+раз
+разве
+рано
+раньше
+ребенок
+решить
+россия
+рука
+русский
+ряд
+рядом
+с
+с кем
+сам
+сама
+сами
+самим
+самими
+самих
+само
+самого
+самой
+самом
+самому
+саму
+самый
+свет
+свое
+своего
+своей
+свои
+своих
+свой
+свою
+сделать
+сеаой
+себе
+себя
+сегодня
+седьмой
+сейчас
+семнадцатый
+семнадцать
+семь
+сидеть
+сила
+сих
+сказал
+сказала
+сказать
+сколько
+слишком
+слово
+случай
+смотреть
+сначала
+снова
+со
+собой
+собою
+советский
+совсем
+спасибо
+спросить
+сразу
+стал
+старый
+стать
+стол
+сторона
+стоять
+страна
+суть
+считать
+т
+та
+так
+такая
+также
+таки
+такие
+такое
+такой
+там
+твои
+твой
+твоя
+твоё
+те
+тебе
+тебя
+тем
+теми
+теперь
+тех
+то
+тобой
+тобою
+товарищ
+тогда
+того
+тоже
+только
+том
+тому
+тот
+тою
+третий
+три
+тринадцатый
+тринадцать
+ту
+туда
+тут
+ты
+тысяч
+у
+увидеть
+уж
+уже
+улица
+уметь
+утро
+хороший
+хорошо
+хотел бы
+хотеть
+хоть
+хотя
+хочешь
+час
+часто
+часть
+чаще
+чего
+человек
+чем
+чему
+через
+четвертый
+четыре
+четырнадцатый
+четырнадцать
+что
+чтоб
+чтобы
+чуть
+шестнадцатый
+шестнадцать
+шестой
+шесть
+эта
+эти
+этим
+этими
+этих
+это
+этого
+этой
+этом
+этому
+этот
+эту
+я
+являюсь
\ No newline at end of file
--- /dev/null
+a
+aby
+aj
+ak
+akej
+akejže
+ako
+akom
+akomže
+akou
+akouže
+akože
+aká
+akáže
+aké
+akého
+akéhože
+akému
+akémuže
+akéže
+akú
+akúže
+aký
+akých
+akýchže
+akým
+akými
+akýmiže
+akýmže
+akýže
+ale
+alebo
+ani
+asi
+avšak
+až
+ba
+bez
+bezo
+bol
+bola
+boli
+bolo
+bude
+budem
+budeme
+budete
+budeš
+budú
+buď
+by
+byť
+cez
+cezo
+dnes
+do
+ešte
+ho
+hoci
+i
+iba
+ich
+im
+inej
+inom
+iná
+iné
+iného
+inému
+iní
+inú
+iný
+iných
+iným
+inými
+ja
+je
+jeho
+jej
+jemu
+ju
+k
+kam
+kamže
+každou
+každá
+každé
+každého
+každému
+každí
+každú
+každý
+každých
+každým
+každými
+kde
+kej
+kejže
+keď
+keďže
+kie
+kieho
+kiehože
+kiemu
+kiemuže
+kieže
+koho
+kom
+komu
+kou
+kouže
+kto
+ktorej
+ktorou
+ktorá
+ktoré
+ktorí
+ktorú
+ktorý
+ktorých
+ktorým
+ktorými
+ku
+ká
+káže
+ké
+kéže
+kú
+kúže
+ký
+kýho
+kýhože
+kým
+kýmu
+kýmuže
+kýže
+lebo
+leda
+ledaže
+len
+ma
+majú
+mal
+mala
+mali
+mať
+medzi
+mi
+mne
+mnou
+moja
+moje
+mojej
+mojich
+mojim
+mojimi
+mojou
+moju
+možno
+mu
+musia
+musieť
+musí
+musím
+musíme
+musíte
+musíš
+my
+má
+mám
+máme
+máte
+máš
+môcť
+môj
+môjho
+môže
+môžem
+môžeme
+môžete
+môžeš
+môžu
+mňa
+na
+nad
+nado
+najmä
+nami
+naša
+naše
+našej
+naši
+našich
+našim
+našimi
+našou
+ne
+nech
+neho
+nej
+nejakej
+nejakom
+nejakou
+nejaká
+nejaké
+nejakého
+nejakému
+nejakú
+nejaký
+nejakých
+nejakým
+nejakými
+nemu
+než
+nich
+nie
+niektorej
+niektorom
+niektorou
+niektorá
+niektoré
+niektorého
+niektorému
+niektorú
+niektorý
+niektorých
+niektorým
+niektorými
+nielen
+niečo
+nim
+nimi
+nič
+ničoho
+ničom
+ničomu
+ničím
+no
+nám
+nás
+náš
+nášho
+ním
+o
+od
+odo
+on
+ona
+oni
+ono
+ony
+oň
+oňho
+po
+pod
+podo
+podľa
+pokiaľ
+popod
+popri
+potom
+poza
+pre
+pred
+predo
+preto
+pretože
+prečo
+pri
+práve
+s
+sa
+seba
+sebe
+sebou
+sem
+si
+sme
+so
+som
+ste
+svoj
+svoja
+svoje
+svojho
+svojich
+svojim
+svojimi
+svojou
+svoju
+svojím
+sú
+ta
+tak
+takej
+takejto
+taká
+takáto
+také
+takého
+takéhoto
+takému
+takémuto
+takéto
+takí
+takú
+takúto
+taký
+takýto
+takže
+tam
+teba
+tebe
+tebou
+teda
+tej
+tejto
+ten
+tento
+ti
+tie
+tieto
+tiež
+to
+toho
+tohoto
+tohto
+tom
+tomto
+tomu
+tomuto
+toto
+tou
+touto
+tu
+tvoj
+tvoja
+tvoje
+tvojej
+tvojho
+tvoji
+tvojich
+tvojim
+tvojimi
+tvojím
+ty
+tá
+táto
+tí
+títo
+tú
+túto
+tých
+tým
+tými
+týmto
+u
+už
+v
+vami
+vaša
+vaše
+vašej
+vaši
+vašich
+vašim
+vaším
+veď
+viac
+vo
+vy
+vám
+vás
+váš
+vášho
+však
+všetci
+všetka
+všetko
+všetky
+všetok
+z
+za
+začo
+začože
+zo
+áno
+čej
+či
+čia
+čie
+čieho
+čiemu
+čiu
+čo
+čoho
+čom
+čomu
+čou
+čože
+čí
+čím
+čími
+ďalšia
+ďalšie
+ďalšieho
+ďalšiemu
+ďalšiu
+ďalšom
+ďalšou
+ďalší
+ďalších
+ďalším
+ďalšími
+ňom
+ňou
+ňu
+že
\ No newline at end of file
--- /dev/null
+a
+ali
+april
+avgust
+b
+bi
+bil
+bila
+bile
+bili
+bilo
+biti
+blizu
+bo
+bodo
+bojo
+bolj
+bom
+bomo
+boste
+bova
+boš
+brez
+c
+cel
+cela
+celi
+celo
+d
+da
+daleč
+dan
+danes
+datum
+december
+deset
+deseta
+deseti
+deseto
+devet
+deveta
+deveti
+deveto
+do
+dober
+dobra
+dobri
+dobro
+dokler
+dol
+dolg
+dolga
+dolgi
+dovolj
+drug
+druga
+drugi
+drugo
+dva
+dve
+e
+eden
+en
+ena
+ene
+eni
+enkrat
+eno
+etc.
+f
+februar
+g
+g.
+ga
+ga.
+gor
+gospa
+gospod
+h
+halo
+i
+idr.
+ii
+iii
+in
+iv
+ix
+iz
+j
+januar
+jaz
+je
+ji
+jih
+jim
+jo
+julij
+junij
+jutri
+k
+kadarkoli
+kaj
+kajti
+kako
+kakor
+kamor
+kamorkoli
+kar
+karkoli
+katerikoli
+kdaj
+kdo
+kdorkoli
+ker
+ki
+kje
+kjer
+kjerkoli
+ko
+koder
+koderkoli
+koga
+komu
+kot
+kratek
+kratka
+kratke
+kratki
+l
+lahka
+lahke
+lahki
+lahko
+le
+lep
+lepa
+lepe
+lepi
+lepo
+leto
+m
+maj
+majhen
+majhna
+majhni
+malce
+malo
+manj
+marec
+me
+med
+medtem
+mene
+mesec
+mi
+midva
+midve
+mnogo
+moj
+moja
+moje
+mora
+morajo
+moram
+moramo
+morate
+moraš
+morem
+mu
+n
+na
+nad
+naj
+najina
+najino
+najmanj
+naju
+največ
+nam
+narobe
+nas
+nato
+nazaj
+naš
+naša
+naše
+ne
+nedavno
+nedelja
+nek
+neka
+nekaj
+nekatere
+nekateri
+nekatero
+nekdo
+neke
+nekega
+neki
+nekje
+neko
+nekoga
+nekoč
+ni
+nikamor
+nikdar
+nikjer
+nikoli
+nič
+nje
+njega
+njegov
+njegova
+njegovo
+njej
+njemu
+njen
+njena
+njeno
+nji
+njih
+njihov
+njihova
+njihovo
+njiju
+njim
+njo
+njun
+njuna
+njuno
+no
+nocoj
+november
+npr.
+o
+ob
+oba
+obe
+oboje
+od
+odprt
+odprta
+odprti
+okoli
+oktober
+on
+onadva
+one
+oni
+onidve
+osem
+osma
+osmi
+osmo
+oz.
+p
+pa
+pet
+peta
+petek
+peti
+peto
+po
+pod
+pogosto
+poleg
+poln
+polna
+polni
+polno
+ponavadi
+ponedeljek
+ponovno
+potem
+povsod
+pozdravljen
+pozdravljeni
+prav
+prava
+prave
+pravi
+pravo
+prazen
+prazna
+prazno
+prbl.
+precej
+pred
+prej
+preko
+pri
+pribl.
+približno
+primer
+pripravljen
+pripravljena
+pripravljeni
+proti
+prva
+prvi
+prvo
+r
+ravno
+redko
+res
+reč
+s
+saj
+sam
+sama
+same
+sami
+samo
+se
+sebe
+sebi
+sedaj
+sedem
+sedma
+sedmi
+sedmo
+sem
+september
+seveda
+si
+sicer
+skoraj
+skozi
+slab
+smo
+so
+sobota
+spet
+sreda
+srednja
+srednji
+sta
+ste
+stran
+stvar
+sva
+t
+ta
+tak
+taka
+take
+taki
+tako
+takoj
+tam
+te
+tebe
+tebi
+tega
+težak
+težka
+težki
+težko
+ti
+tista
+tiste
+tisti
+tisto
+tj.
+tja
+to
+toda
+torek
+tretja
+tretje
+tretji
+tri
+tu
+tudi
+tukaj
+tvoj
+tvoja
+tvoje
+u
+v
+vaju
+vam
+vas
+vaš
+vaša
+vaše
+ve
+vedno
+velik
+velika
+veliki
+veliko
+vendar
+ves
+več
+vi
+vidva
+vii
+viii
+visok
+visoka
+visoke
+visoki
+vsa
+vsaj
+vsak
+vsaka
+vsakdo
+vsake
+vsaki
+vsakomur
+vse
+vsega
+vsi
+vso
+včasih
+včeraj
+x
+z
+za
+zadaj
+zadnji
+zakaj
+zaprta
+zaprti
+zaprto
+zdaj
+zelo
+zunaj
+č
+če
+često
+četrta
+četrtek
+četrti
+četrto
+čez
+čigav
+š
+šest
+šesta
+šesti
+šesto
+štiri
+ž
+že
\ No newline at end of file
--- /dev/null
+aad
+albaabkii
+atabo
+ay
+ayaa
+ayee
+ayuu
+dhan
+hadana
+in
+inuu
+isku
+jiray
+jirtay
+ka
+kale
+kasoo
+ku
+kuu
+lakin
+markii
+oo
+si
+soo
+uga
+ugu
+uu
+waa
+waxa
+waxuu
\ No newline at end of file
--- /dev/null
+a
+ba
+bane
+bona
+e
+ea
+eaba
+empa
+ena
+ha
+hae
+hape
+ho
+hore
+ka
+ke
+la
+le
+li
+me
+mo
+moo
+ne
+o
+oa
+re
+sa
+se
+tloha
+tsa
+tse
\ No newline at end of file
--- /dev/null
+aderton
+adertonde
+adjö
+aldrig
+alla
+allas
+allt
+alltid
+alltså
+andra
+andras
+annan
+annat
+artonde
+arton
+att
+av
+bakom
+bara
+behöva
+behövas
+behövde
+behövt
+beslut
+beslutat
+beslutit
+bland
+blev
+bli
+blir
+blivit
+bort
+borta
+bra
+bäst
+bättre
+båda
+bådas
+dag
+dagar
+dagarna
+dagen
+de
+del
+delen
+dem
+den
+denna
+deras
+dess
+dessa
+det
+detta
+dig
+din
+dina
+dit
+ditt
+dock
+dom
+du
+där
+därför
+då
+e
+efter
+eftersom
+ej
+elfte
+eller
+elva
+emot
+en
+enkel
+enkelt
+enkla
+enligt
+ens
+er
+era
+ers
+ert
+ett
+ettusen
+fanns
+fem
+femte
+femtio
+femtionde
+femton
+femtonde
+fick
+fin
+finnas
+finns
+fjorton
+fjortonde
+fjärde
+fler
+flera
+flesta
+fram
+framför
+från
+fyra
+fyrtio
+fyrtionde
+få
+får
+fått
+följande
+för
+före
+förlåt
+förra
+första
+genast
+genom
+gick
+gjorde
+gjort
+god
+goda
+godare
+godast
+gott
+gälla
+gäller
+gällt
+gärna
+gå
+går
+gått
+gör
+göra
+ha
+hade
+haft
+han
+hans
+har
+heller
+hellre
+helst
+helt
+henne
+hennes
+hit
+hon
+honom
+hundra
+hundraen
+hundraett
+hur
+här
+hög
+höger
+högre
+högst
+i
+ibland
+icke
+idag
+igen
+igår
+imorgon
+in
+inför
+inga
+ingen
+ingenting
+inget
+innan
+inne
+inom
+inte
+inuti
+ja
+jag
+jo
+ju
+just
+jämfört
+kan
+kanske
+knappast
+kom
+komma
+kommer
+kommit
+kr
+kunde
+kunna
+kunnat
+kvar
+legat
+ligga
+ligger
+lika
+likställd
+likställda
+lilla
+lite
+liten
+litet
+länge
+längre
+längst
+lätt
+lättare
+lättast
+långsam
+långsammare
+långsammast
+långsamt
+långt
+låt
+man
+med
+mej
+mellan
+men
+mer
+mera
+mest
+mig
+min
+mina
+mindre
+minst
+mitt
+mittemot
+mot
+mycket
+många
+måste
+möjlig
+möjligen
+möjligt
+möjligtvis
+ned
+nederst
+nedersta
+nedre
+nej
+ner
+ni
+nio
+nionde
+nittio
+nittionde
+nitton
+nittonde
+nog
+noll
+nr
+nu
+nummer
+när
+nästa
+någon
+någonting
+något
+några
+nån
+nånting
+nåt
+nödvändig
+nödvändiga
+nödvändigt
+nödvändigtvis
+och
+också
+ofta
+oftast
+olika
+olikt
+om
+oss
+på
+rakt
+redan
+rätt
+sa
+sade
+sagt
+samma
+sedan
+senare
+senast
+sent
+sex
+sextio
+sextionde
+sexton
+sextonde
+sig
+sin
+sina
+sist
+sista
+siste
+sitt
+sitta
+sju
+sjunde
+sjuttio
+sjuttionde
+sjutton
+sjuttonde
+själv
+sjätte
+ska
+skall
+skulle
+slutligen
+små
+smått
+snart
+som
+stor
+stora
+stort
+större
+störst
+säga
+säger
+sämre
+sämst
+så
+sådan
+sådana
+sådant
+ta
+tack
+tar
+tidig
+tidigare
+tidigast
+tidigt
+till
+tills
+tillsammans
+tio
+tionde
+tjugo
+tjugoen
+tjugoett
+tjugonde
+tjugotre
+tjugotvå
+tjungo
+tolfte
+tolv
+tre
+tredje
+trettio
+trettionde
+tretton
+trettonde
+två
+tvåhundra
+under
+upp
+ur
+ursäkt
+ut
+utan
+utanför
+ute
+va
+vad
+var
+vara
+varför
+varifrån
+varit
+varje
+varken
+vars
+varsågod
+vart
+vem
+vems
+verkligen
+vi
+vid
+vidare
+viktig
+viktigare
+viktigast
+viktigt
+vilka
+vilkas
+vilken
+vilket
+vill
+väl
+vänster
+vänstra
+värre
+vår
+våra
+vårt
+än
+ännu
+är
+även
+åt
+åtminstone
+åtta
+åttio
+åttionde
+åttonde
+över
+övermorgon
+överst
+övre
\ No newline at end of file
--- /dev/null
+akasema
+alikuwa
+alisema
+baada
+basi
+bila
+cha
+chini
+hadi
+hapo
+hata
+hivyo
+hiyo
+huku
+huo
+ili
+ilikuwa
+juu
+kama
+karibu
+katika
+kila
+kima
+kisha
+kubwa
+kutoka
+kuwa
+kwa
+kwamba
+kwenda
+kwenye
+la
+lakini
+mara
+mdogo
+mimi
+mkubwa
+mmoja
+moja
+muda
+mwenye
+na
+naye
+ndani
+ng
+ni
+nini
+nonkungu
+pamoja
+pia
+sana
+sasa
+sauti
+tafadhali
+tena
+tu
+vile
+wa
+wakati
+wake
+walikuwa
+wao
+watu
+wengine
+wote
+ya
+yake
+yangu
+yao
+yeye
+yule
+za
+zaidi
+zake
\ No newline at end of file
--- /dev/null
+กล่าว
+กว่า
+กัน
+กับ
+การ
+ก็
+ก่อน
+ขณะ
+ขอ
+ของ
+ขึ้น
+คง
+ครั้ง
+ความ
+คือ
+จะ
+จัด
+จาก
+จึง
+ช่วง
+ซึ่ง
+ดัง
+ด้วย
+ด้าน
+ตั้ง
+ตั้งแต่
+ตาม
+ต่อ
+ต่าง
+ต่างๆ
+ต้อง
+ถึง
+ถูก
+ถ้า
+ทั้ง
+ทั้งนี้
+ทาง
+ทำ
+ทำให้
+ที่
+ที่สุด
+ทุก
+นอกจาก
+นัก
+นั้น
+นำ
+นี้
+น่า
+บาง
+ผล
+ผ่าน
+พบ
+พร้อม
+มา
+มาก
+มี
+ยัง
+รวม
+ระหว่าง
+รับ
+ราย
+ร่วม
+ลง
+วัน
+ว่า
+สำหรับ
+สุด
+ส่ง
+ส่วน
+หนึ่ง
+หรือ
+หลัง
+หลังจาก
+หลาย
+หาก
+อยาก
+อยู่
+อย่าง
+ออก
+อะไร
+อาจ
+อีก
+เขา
+เข้า
+เคย
+เฉพาะ
+เช่น
+เดียว
+เดียวกัน
+เนื่องจาก
+เปิด
+เปิดเผย
+เป็น
+เป็นการ
+เพราะ
+เพื่อ
+เมื่อ
+เรา
+เริ่ม
+เลย
+เห็น
+เอง
+แต่
+แบบ
+แรก
+และ
+แล้ว
+แห่ง
+โดย
+ใน
+ให้
+ได้
+ไป
+ไม่
+ไว้
\ No newline at end of file
--- /dev/null
+akin
+aking
+ako
+alin
+am
+amin
+aming
+ang
+ano
+anumang
+apat
+at
+atin
+ating
+ay
+bababa
+bago
+bakit
+bawat
+bilang
+dahil
+dalawa
+dapat
+din
+dito
+doon
+gagawin
+gayunman
+ginagawa
+ginawa
+ginawang
+gumawa
+gusto
+habang
+hanggang
+hindi
+huwag
+iba
+ibaba
+ibabaw
+ibig
+ikaw
+ilagay
+ilalim
+ilan
+inyong
+isa
+isang
+itaas
+ito
+iyo
+iyon
+iyong
+ka
+kahit
+kailangan
+kailanman
+kami
+kanila
+kanilang
+kanino
+kanya
+kanyang
+kapag
+kapwa
+karamihan
+katiyakan
+katulad
+kaya
+kaysa
+ko
+kong
+kulang
+kumuha
+kung
+laban
+lahat
+lamang
+likod
+lima
+maaari
+maaaring
+maging
+mahusay
+makita
+marami
+marapat
+masyado
+may
+mayroon
+mga
+minsan
+mismo
+mula
+muli
+na
+nabanggit
+naging
+nagkaroon
+nais
+nakita
+namin
+napaka
+narito
+nasaan
+ng
+ngayon
+ni
+nila
+nilang
+nito
+niya
+niyang
+noon
+o
+pa
+paano
+pababa
+paggawa
+pagitan
+pagkakaroon
+pagkatapos
+palabas
+pamamagitan
+panahon
+pangalawa
+para
+paraan
+pareho
+pataas
+pero
+pumunta
+pumupunta
+sa
+saan
+sabi
+sabihin
+sarili
+sila
+sino
+siya
+tatlo
+tayo
+tulad
+tungkol
+una
+walang
\ No newline at end of file
--- /dev/null
+acaba
+acep
+adamakıllı
+adeta
+ait
+altmýþ
+altmış
+altý
+altı
+ama
+amma
+anca
+ancak
+arada
+artýk
+aslında
+aynen
+ayrıca
+az
+açıkça
+açıkçası
+bana
+bari
+bazen
+bazý
+bazı
+başkası
+baţka
+belki
+ben
+benden
+beni
+benim
+beri
+beriki
+beþ
+beş
+beţ
+bilcümle
+bile
+bin
+binaen
+binaenaleyh
+bir
+biraz
+birazdan
+birbiri
+birden
+birdenbire
+biri
+birice
+birileri
+birisi
+birkaç
+birkaçı
+birkez
+birlikte
+birçok
+birçoğu
+birþey
+birþeyi
+birşey
+birşeyi
+birţey
+bitevi
+biteviye
+bittabi
+biz
+bizatihi
+bizce
+bizcileyin
+bizden
+bize
+bizi
+bizim
+bizimki
+bizzat
+boşuna
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+buracıkta
+burada
+buradan
+burası
+böyle
+böylece
+böylecene
+böylelikle
+böylemesine
+böylesine
+büsbütün
+bütün
+cuk
+cümlesi
+da
+daha
+dahi
+dahil
+dahilen
+daima
+dair
+dayanarak
+de
+defa
+dek
+demin
+demincek
+deminden
+denli
+derakap
+derhal
+derken
+deđil
+değil
+değin
+diye
+diđer
+diğer
+diğeri
+doksan
+dokuz
+dolayı
+dolayısıyla
+doğru
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+elbet
+elbette
+elli
+emme
+en
+enikonu
+epey
+epeyce
+epeyi
+esasen
+esnasında
+etmesi
+etraflı
+etraflıca
+etti
+ettiği
+ettiğini
+evleviyetle
+evvel
+evvela
+evvelce
+evvelden
+evvelemirde
+evveli
+eđer
+eğer
+fakat
+filanca
+gah
+gayet
+gayetle
+gayri
+gayrı
+gelgelelim
+gene
+gerek
+gerçi
+geçende
+geçenlerde
+gibi
+gibilerden
+gibisinden
+gine
+göre
+gırla
+hakeza
+halbuki
+halen
+halihazırda
+haliyle
+handiyse
+hangi
+hangisi
+hani
+hariç
+hasebiyle
+hasılı
+hatta
+hele
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkes
+herkesin
+hiç
+hiçbir
+hiçbiri
+hoş
+hulasaten
+iken
+iki
+ila
+ile
+ilen
+ilgili
+ilk
+illa
+illaki
+imdi
+indinde
+inen
+insermi
+ise
+ister
+itibaren
+itibariyle
+itibarıyla
+iyi
+iyice
+iyicene
+için
+iş
+işte
+iţte
+kadar
+kaffesi
+kah
+kala
+kanýmca
+karşın
+katrilyon
+kaynak
+kaçı
+kelli
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kere
+kez
+keza
+kezalik
+keşke
+keţke
+ki
+kim
+kimden
+kime
+kimi
+kimisi
+kimse
+kimsecik
+kimsecikler
+külliyen
+kýrk
+kýsaca
+kırk
+kısaca
+lakin
+leh
+lütfen
+maada
+madem
+mademki
+mamafih
+mebni
+međer
+meğer
+meğerki
+meğerse
+milyar
+milyon
+mu
+mü
+mý
+mı
+nasýl
+nasıl
+nasılsa
+nazaran
+naşi
+ne
+neden
+nedeniyle
+nedenle
+nedense
+nerde
+nerden
+nerdeyse
+nere
+nerede
+nereden
+neredeyse
+neresi
+nereye
+netekim
+neye
+neyi
+neyse
+nice
+nihayet
+nihayetinde
+nitekim
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduklarını
+oldukça
+olduğu
+olduğunu
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+onca
+onculayın
+onda
+ondan
+onlar
+onlardan
+onlari
+onlarýn
+onları
+onların
+onu
+onun
+oracık
+oracıkta
+orada
+oradan
+oranca
+oranla
+oraya
+otuz
+oysa
+oysaki
+pek
+pekala
+peki
+pekçe
+peyderpey
+rağmen
+sadece
+sahi
+sahiden
+sana
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+sonra
+sonradan
+sonraları
+sonunda
+tabii
+tam
+tamam
+tamamen
+tamamıyla
+tarafından
+tek
+trilyon
+tüm
+var
+vardı
+vasıtasıyla
+ve
+velev
+velhasıl
+velhasılıkelam
+veya
+veyahut
+ya
+yahut
+yakinen
+yakında
+yakından
+yakınlarda
+yalnız
+yalnızca
+yani
+yapacak
+yapmak
+yaptı
+yaptıkları
+yaptığı
+yaptığını
+yapılan
+yapılması
+yapıyor
+yedi
+yeniden
+yenilerde
+yerine
+yetmiþ
+yetmiş
+yetmiţ
+yine
+yirmi
+yok
+yoksa
+yoluyla
+yüz
+yüzünden
+zarfında
+zaten
+zati
+zira
+çabuk
+çabukça
+çeşitli
+çok
+çokları
+çoklarınca
+çokluk
+çoklukla
+çokça
+çoğu
+çoğun
+çoğunca
+çoğunlukla
+çünkü
+öbür
+öbürkü
+öbürü
+önce
+önceden
+önceleri
+öncelikle
+öteki
+ötekisi
+öyle
+öylece
+öylelikle
+öylemesine
+öz
+üzere
+üç
+þey
+þeyden
+þeyi
+þeyler
+þu
+þuna
+þunda
+þundan
+þunu
+şayet
+şey
+şeyden
+şeyi
+şeyler
+şu
+şuna
+şuncacık
+şunda
+şundan
+şunlar
+şunları
+şunu
+şunun
+şura
+şuracık
+şuracıkta
+şurası
+şöyle
+ţayet
+ţimdi
+ţu
+ţöyle
\ No newline at end of file
--- /dev/null
+авжеж
+адже
+але
+б
+без
+був
+була
+були
+було
+бути
+більш
+вам
+вас
+весь
+вздовж
+ви
+вниз
+внизу
+вона
+вони
+воно
+все
+всередині
+всіх
+від
+він
+да
+давай
+давати
+де
+дещо
+для
+до
+з
+завжди
+замість
+й
+коли
+ледве
+майже
+ми
+навколо
+навіть
+нам
+от
+отже
+отож
+поза
+про
+під
+та
+так
+такий
+також
+те
+ти
+тобто
+тож
+тощо
+хоча
+це
+цей
+чи
+чого
+що
+як
+який
+якої
+є
+із
+інших
+їх
+її
\ No newline at end of file
--- /dev/null
+آئی
+آئے
+آج
+آخر
+آخرکبر
+آدهی
+آًب
+آٹھ
+آیب
+اة
+اخبزت
+اختتبم
+ادھر
+ارد
+اردگرد
+ارکبى
+اش
+اضتعوبل
+اضتعوبلات
+اضطرذ
+اضکب
+اضکی
+اضکے
+اطراف
+اغیب
+افراد
+الگ
+اور
+اوًچب
+اوًچبئی
+اوًچی
+اوًچے
+اى
+اً
+اًذر
+اًہیں
+اٹھبًب
+اپٌب
+اپٌے
+اچھب
+اچھی
+اچھے
+اکثر
+اکٹھب
+اکٹھی
+اکٹھے
+اکیلا
+اکیلی
+اکیلے
+اگرچہ
+اہن
+ایطے
+ایک
+ب
+ت
+تبزٍ
+تت
+تر
+ترتیت
+تریي
+تعذاد
+تن
+تو
+توبم
+توہی
+توہیں
+تٌہب
+تک
+تھب
+تھوڑا
+تھوڑی
+تھوڑے
+تھی
+تھے
+تیي
+ثب
+ثبئیں
+ثبترتیت
+ثبری
+ثبرے
+ثبعث
+ثبلا
+ثبلترتیت
+ثبہر
+ثدبئے
+ثرآں
+ثراں
+ثرش
+ثعذ
+ثغیر
+ثلٌذ
+ثلٌذوثبلا
+ثلکہ
+ثي
+ثٌب
+ثٌبرہب
+ثٌبرہی
+ثٌبرہے
+ثٌبًب
+ثٌذ
+ثٌذکرو
+ثٌذکرًب
+ثٌذی
+ثڑا
+ثڑوں
+ثڑی
+ثڑے
+ثھر
+ثھرا
+ثھراہوا
+ثھرپور
+ثھی
+ثہت
+ثہتر
+ثہتری
+ثہتریي
+ثیچ
+ج
+خب
+خبرہب
+خبرہی
+خبرہے
+خبهوظ
+خبًب
+خبًتب
+خبًتی
+خبًتے
+خبًٌب
+خت
+ختن
+خجکہ
+خص
+خططرذ
+خلذی
+خو
+خواى
+خوًہی
+خوکہ
+خٌبة
+خگہ
+خگہوں
+خگہیں
+خیطب
+خیطبکہ
+در
+درخبت
+درخہ
+درخے
+درزقیقت
+درضت
+دش
+دفعہ
+دلچطپ
+دلچطپی
+دلچطپیبں
+دو
+دور
+دوراى
+دوضرا
+دوضروں
+دوضری
+دوضرے
+دوًوں
+دکھبئیں
+دکھبتب
+دکھبتی
+دکھبتے
+دکھبو
+دکھبًب
+دکھبیب
+دی
+دیب
+دیتب
+دیتی
+دیتے
+دیر
+دیٌب
+دیکھو
+دیکھٌب
+دیکھی
+دیکھیں
+دے
+ر
+راضتوں
+راضتہ
+راضتے
+رریعہ
+رریعے
+رکي
+رکھ
+رکھب
+رکھتب
+رکھتبہوں
+رکھتی
+رکھتے
+رکھی
+رکھے
+رہب
+رہی
+رہے
+ز
+زبصل
+زبضر
+زبل
+زبلات
+زبلیہ
+زصوں
+زصہ
+زصے
+زقبئق
+زقیتیں
+زقیقت
+زکن
+زکویہ
+زیبدٍ
+صبف
+صسیر
+صفر
+صورت
+صورتسبل
+صورتوں
+صورتیں
+ض
+ضبت
+ضبتھ
+ضبدٍ
+ضبرا
+ضبرے
+ضبل
+ضبلوں
+ضت
+ضرور
+ضرورت
+ضروری
+ضلطلہ
+ضوچ
+ضوچب
+ضوچتب
+ضوچتی
+ضوچتے
+ضوچو
+ضوچٌب
+ضوچی
+ضوچیں
+ضکب
+ضکتب
+ضکتی
+ضکتے
+ضکٌب
+ضکی
+ضکے
+ضیذھب
+ضیذھی
+ضیذھے
+ضیکٌڈ
+ضے
+طرف
+طریق
+طریقوں
+طریقہ
+طریقے
+طور
+طورپر
+ظبہر
+ع
+عذد
+عظین
+علاقوں
+علاقہ
+علاقے
+علاوٍ
+عووهی
+غبیذ
+غخص
+غذ
+غروع
+غروعبت
+غے
+فرد
+فی
+ق
+قجل
+قجیلہ
+قطن
+لئے
+لا
+لازهی
+لو
+لوجب
+لوجی
+لوجے
+لوسبت
+لوسہ
+لوگ
+لوگوں
+لڑکپي
+لگتب
+لگتی
+لگتے
+لگٌب
+لگی
+لگیں
+لگے
+لی
+لیب
+لیٌب
+لیں
+لے
+ه
+هتعلق
+هختلف
+هسترم
+هسترهہ
+هسطوش
+هسیذ
+هطئلہ
+هطئلے
+هطبئل
+هطتعول
+هطلق
+هعلوم
+هػتول
+هلا
+هوکي
+هوکٌبت
+هوکٌہ
+هٌبضت
+هڑا
+هڑًب
+هڑے
+هکول
+هگر
+هہرثبى
+هیرا
+هیری
+هیرے
+هیں
+و
+وار
+والے
+وٍ
+ًئی
+ًئے
+ًب
+ًبپطٌذ
+ًبگسیر
+ًطجت
+ًقطہ
+ًو
+ًوخواى
+ًکبلٌب
+ًکتہ
+ًہ
+ًہیں
+ًیب
+ًے
+ٓ آش
+ٹھیک
+پبئے
+پبش
+پبًب
+پبًچ
+پر
+پراًب
+پطٌذ
+پل
+پورا
+پوچھب
+پوچھتب
+پوچھتی
+پوچھتے
+پوچھو
+پوچھوں
+پوچھٌب
+پوچھیں
+پچھلا
+پھر
+پہلا
+پہلی
+پہلےضی
+پہلےضے
+پہلےضےہی
+پیع
+چبر
+چبہب
+چبہٌب
+چبہے
+چلا
+چلو
+چلیں
+چلے
+چکب
+چکی
+چکیں
+چکے
+چھوٹب
+چھوٹوں
+چھوٹی
+چھوٹے
+چھہ
+چیسیں
+ڈھوًڈا
+ڈھوًڈلیب
+ڈھوًڈو
+ڈھوًڈًب
+ڈھوًڈی
+ڈھوًڈیں
+ک
+کئی
+کئے
+کب
+کبفی
+کبم
+کت
+کجھی
+کرا
+کرتب
+کرتبہوں
+کرتی
+کرتے
+کرتےہو
+کررہب
+کررہی
+کررہے
+کرو
+کرًب
+کریں
+کرے
+کطی
+کل
+کن
+کوئی
+کوتر
+کورا
+کوروں
+کورٍ
+کورے
+کوطي
+کوى
+کوًطب
+کوًطی
+کوًطے
+کھولا
+کھولو
+کھولٌب
+کھولی
+کھولیں
+کھولے
+کہ
+کہب
+کہتب
+کہتی
+کہتے
+کہو
+کہوں
+کہٌب
+کہی
+کہیں
+کہے
+کی
+کیب
+کیطب
+کیطرف
+کیطے
+کیلئے
+کیوًکہ
+کیوں
+کیے
+کے
+کےثعذ
+کےرریعے
+گئی
+گئے
+گب
+گرد
+گروٍ
+گروپ
+گروہوں
+گٌتی
+گی
+گیب
+گے
+ہر
+ہن
+ہو
+ہوئی
+ہوئے
+ہوا
+ہوبرا
+ہوبری
+ہوبرے
+ہوتب
+ہوتی
+ہوتے
+ہورہب
+ہورہی
+ہورہے
+ہوضکتب
+ہوضکتی
+ہوضکتے
+ہوًب
+ہوًی
+ہوًے
+ہوچکب
+ہوچکی
+ہوچکے
+ہوگئی
+ہوگئے
+ہوگیب
+ہوں
+ہی
+ہیں
+ہے
+ی
+یقیٌی
+یہ
+یہبں
\ No newline at end of file
--- /dev/null
+a ha
+a-lô
+ai
+ai ai
+ai nấy
+alô
+amen
+anh
+bao giờ
+bao lâu
+bao nhiêu
+bao nả
+bay biến
+biết
+biết bao
+biết bao nhiêu
+biết chừng nào
+biết mấy
+biết đâu
+biết đâu chừng
+biết đâu đấy
+bà
+bài
+bác
+bây bẩy
+bây chừ
+bây giờ
+bây nhiêu
+bèn
+béng
+bông
+bạn
+bản
+bất chợt
+bất cứ
+bất giác
+bất kì
+bất kể
+bất kỳ
+bất luận
+bất nhược
+bất quá
+bất thình lình
+bất tử
+bất đồ
+bấy
+bấy chầy
+bấy chừ
+bấy giờ
+bấy lâu
+bấy lâu nay
+bấy nay
+bấy nhiêu
+bập bà bập bõm
+bập bõm
+bắt đầu từ
+bằng
+bằng không
+bằng nấy
+bằng ấy
+bển
+bệt
+bị
+bỏ mẹ
+bỗng
+bỗng chốc
+bỗng dưng
+bỗng không
+bỗng nhiên
+bỗng đâu
+bộ
+bội phần
+bớ
+bởi
+bởi chưng
+bởi nhưng
+bởi thế
+bởi vì
+bởi vậy
+bức
+cao
+cha
+cha chả
+chao ôi
+chiếc
+cho
+cho nên
+cho tới
+cho tới khi
+cho đến
+cho đến khi
+choa
+chu cha
+chui cha
+chung cục
+chung qui
+chung quy
+chung quy lại
+chuyện
+chành chạnh
+chí chết
+chính
+chính là
+chính thị
+chùn chùn
+chùn chũn
+chú
+chú mày
+chú mình
+chúng mình
+chúng ta
+chúng tôi
+chăn chắn
+chăng
+chưa
+chầm chập
+chậc
+chắc
+chắc hẳn
+chẳng lẽ
+chẳng những
+chẳng nữa
+chẳng phải
+chết nỗi
+chết thật
+chết tiệt
+chỉ
+chỉn
+chốc chốc
+chớ
+chớ chi
+chợt
+chủn
+chứ
+chứ lị
+coi bộ
+coi mòi
+con
+cu cậu
+cuốn
+cuộc
+càng
+các
+cái
+cây
+còn
+có
+có chăng là
+có dễ
+có thể
+có vẻ
+cóc khô
+cô
+cô mình
+công nhiên
+cùng
+cùng cực
+cùng nhau
+cùng với
+căn
+căn cắt
+cũng
+cũng như
+cũng vậy
+cũng vậy thôi
+cơ
+cơ chừng
+cơ hồ
+cơ mà
+cơn
+cả
+cả thảy
+cả thể
+cảm ơn
+cần
+cật lực
+cật sức
+cậu
+cổ lai
+của
+cứ
+cứ việc
+cực lực
+do
+do vì
+do vậy
+do đó
+duy
+dào
+dì
+dù cho
+dù rằng
+dưới
+dạ
+dần dà
+dần dần
+dầu sao
+dẫu
+dẫu sao
+dễ sợ
+dễ thường
+dở chừng
+dữ
+em
+giữa
+gì
+hay
+hoàn toàn
+hoặc
+hơn
+hầu hết
+họ
+hỏi
+khi
+khác
+không
+luôn
+là
+làm
+lên
+lúc
+lại
+lần
+lớn
+muốn
+mà
+mình
+mỗi
+một
+một cách
+mới
+mợ
+ngay
+ngay cả
+ngay khi
+ngay lúc
+ngay lập tức
+ngay tức khắc
+ngay từ
+nghe chừng
+nghe đâu
+nghen
+nghiễm nhiên
+nghỉm
+ngoài
+ngoài ra
+ngoải
+ngày
+ngày càng
+ngày ngày
+ngày xưa
+ngày xửa
+ngôi
+ngõ hầu
+ngăn ngắt
+ngươi
+người
+ngọn
+ngọt
+ngộ nhỡ
+nh
+nhau
+nhiên hậu
+nhiều
+nhiệt liệt
+nhung nhăng
+nhà
+nhân dịp
+nhân tiện
+nhé
+nhón nhén
+như
+như chơi
+như không
+như quả
+như thể
+như tuồng
+như vậy
+nhưng
+nhưng mà
+nhược bằng
+nhất
+nhất loạt
+nhất luật
+nhất mực
+nhất nhất
+nhất quyết
+nhất sinh
+nhất thiết
+nhất tâm
+nhất tề
+nhất đán
+nhất định
+nhận
+nhỉ
+nhỡ ra
+những
+những ai
+những như
+nào
+này
+nên
+nên chi
+nó
+nóc
+nói
+năm
+nơi
+nấy
+nếu
+nếu như
+nền
+nọ
+nớ
+nức nở
+nữa
+oai oái
+oái
+pho
+phè
+phóc
+phót
+phăn phắt
+phương chi
+phải
+phải chi
+phải chăng
+phắt
+phỉ phui
+phỏng
+phỏng như
+phốc
+phụt
+phứt
+qua
+qua quít
+qua quýt
+quyết
+quyết nhiên
+quyển
+quá
+quá chừng
+quá lắm
+quá sá
+quá thể
+quá trời
+quá xá
+quá đỗi
+quá độ
+quá ư
+quý hồ
+quả
+quả là
+quả tang
+quả thật
+quả tình
+quả vậy
+quả đúng
+ra
+ra phết
+ra sao
+ra trò
+ren rén
+riu ríu
+riêng
+riệt
+rày
+ráo
+ráo trọi
+rén
+rích
+rón rén
+rút cục
+răng
+rất
+rằng
+rằng là
+rốt cuộc
+rốt cục
+rồi
+rứa
+sa sả
+sao
+sau
+sau chót
+sau cuối
+sau cùng
+sau đó
+so
+song le
+suýt
+sì
+sạch
+sất
+sắp
+sẽ
+số
+số là
+sốt sột
+sở dĩ
+sự
+tanh
+tha hồ
+than ôi
+thanh
+theo
+thi thoảng
+thoạt
+thoạt nhiên
+thoắt
+thuần
+thà
+thà là
+thà rằng
+thành ra
+thành thử
+thái quá
+tháng
+thì
+thì thôi
+thình lình
+thím
+thôi
+thúng thắng
+thương ôi
+thường
+thảo hèn
+thảo nào
+thấy
+thẩy
+thậm
+thậm chí
+thật lực
+thật ra
+thật vậy
+thế
+thế là
+thế mà
+thế nào
+thế nên
+thế ra
+thế thì
+thế à
+thếch
+thỉnh thoảng
+thỏm
+thốc
+thốc tháo
+thốt
+thốt nhiên
+thộc
+thời gian
+thục mạng
+thửa
+thực ra
+thực sự
+thực vậy
+tiếp theo
+tiếp đó
+tiện thể
+toà
+toé khói
+toẹt
+trong
+trên
+trước
+trước kia
+trước nay
+trước tiên
+trước đây
+trước đó
+trếu tráo
+trển
+trệt
+trệu trạo
+trỏng
+trời đất ơi
+trừ phi
+tuy
+tuy nhiên
+tuy rằng
+tuy thế
+tuy vậy
+tuyệt nhiên
+tuần tự
+tuốt luốt
+tuốt tuồn tuột
+tuốt tuột
+tà tà
+tênh
+tít mù
+tò te
+tôi
+tông tốc
+tù tì
+tăm tắp
+tại
+tại vì
+tấm
+tấn
+tất cả
+tất thảy
+tất tần tật
+tất tật
+tắp
+tắp lự
+tọt
+tỏ ra
+tỏ vẻ
+tốc tả
+tối ư
+tột
+tớ
+tới
+tức thì
+tức tốc
+từ
+từng
+tự vì
+tựu trung
+veo
+veo veo
+việc
+vung thiên địa
+vung tàn tán
+vung tán tàn
+và
+vào
+vâng
+vèo
+vì
+vì chưng
+vì thế
+vì vậy
+ví bằng
+ví dù
+ví phỏng
+ví thử
+vô hình trung
+vô kể
+vô luận
+vô vàn
+văng tê
+vạn nhất
+vả chăng
+vả lại
+vẫn
+vậy
+vậy là
+vậy thì
+về
+vị tất
+vốn dĩ
+với
+với lại
+vở
+vụt
+vừa
+vừa mới
+xa xả
+xiết bao
+xon xón
+xoành xoạch
+xoét
+xoẳn
+xoẹt
+xuất kì bất ý
+xuất kỳ bất ý
+xuể
+xuống
+xăm xúi
+xăm xăm
+xăm xắm
+xềnh xệch
+xệp
+à
+à ơi
+ào
+á
+á à
+ái
+ái chà
+ái dà
+áng
+âu là
+ô hay
+ô hô
+ô kê
+ô kìa
+ôi chao
+ôi thôi
+ông
+úi
+úi chà
+úi dào
+ý
+ý chừng
+ý da
+đang
+đi
+điều
+đành đạch
+đáng lí
+đáng lý
+đáng lẽ
+đánh đùng
+đáo để
+đây
+đã
+đó
+được
+đại loại
+đại nhân
+đại phàm
+đại để
+đến
+đến nỗi
+đều
+để
+ơ
+ơ hay
+ơ kìa
+ơi
+ư
+ạ
+ạ ơi
+ấy
+ầu ơ
+ắt
+ắt hẳn
+ắt là
+ối dào
+ối giời
+ối giời ơi
+ồ
+ổng
+ớ
+ờ
+ở
+ở trên
+ủa
+ứ hự
+ứ ừ
+ừ
+ử
\ No newline at end of file
--- /dev/null
+a
+an
+bá
+bí
+bẹ̀rẹ̀
+fún
+fẹ́
+gbogbo
+inú
+jù
+jẹ
+jẹ́
+kan
+kì
+kí
+kò
+láti
+lè
+lọ
+mi
+mo
+máa
+mọ̀
+ni
+náà
+ní
+nígbà
+nítorí
+nǹkan
+o
+padà
+pé
+púpọ̀
+pẹ̀lú
+rẹ̀
+sì
+sí
+sínú
+ṣ
+ti
+tí
+wà
+wá
+wọn
+wọ́n
+yìí
+àti
+àwọn
+é
+í
+òun
+ó
+ń
+ńlá
+ṣe
+ṣé
+ṣùgbọ́n
+ẹmọ́
+ọjọ́
+ọ̀pọ̀lọpọ̀
\ No newline at end of file
--- /dev/null
+、
+。
+〈
+〉
+《
+》
+一
+一个
+一些
+一何
+一切
+一则
+一方面
+一旦
+一来
+一样
+一种
+一般
+一转眼
+七
+万一
+三
+上
+上下
+下
+不
+不仅
+不但
+不光
+不单
+不只
+不外乎
+不如
+不妨
+不尽
+不尽然
+不得
+不怕
+不惟
+不成
+不拘
+不料
+不是
+不比
+不然
+不特
+不独
+不管
+不至于
+不若
+不论
+不过
+不问
+与
+与其
+与其说
+与否
+与此同时
+且
+且不说
+且说
+两者
+个
+个别
+中
+临
+为
+为了
+为什么
+为何
+为止
+为此
+为着
+乃
+乃至
+乃至于
+么
+之
+之一
+之所以
+之类
+乌乎
+乎
+乘
+九
+也
+也好
+也罢
+了
+二
+二来
+于
+于是
+于是乎
+云云
+云尔
+五
+些
+亦
+人
+人们
+人家
+什
+什么
+什么样
+今
+介于
+仍
+仍旧
+从
+从此
+从而
+他
+他人
+他们
+他们们
+以
+以上
+以为
+以便
+以免
+以及
+以故
+以期
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+会
+似的
+但
+但凡
+但是
+何
+何以
+何况
+何处
+何时
+余外
+作为
+你
+你们
+使
+使得
+例如
+依
+依据
+依照
+便于
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+借
+借傥然
+假使
+假如
+假若
+做
+像
+儿
+先不先
+光
+光是
+全体
+全部
+八
+六
+兮
+共
+关于
+关于具体地说
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其次
+具体地说
+具体说来
+兼之
+内
+再
+再其次
+再则
+再有
+再者
+再者说
+再说
+冒
+冲
+况且
+几
+几时
+凡
+凡是
+凭
+凭借
+出于
+出来
+分
+分别
+则
+则甚
+别
+别人
+别处
+别是
+别的
+别管
+别说
+到
+前后
+前此
+前者
+加之
+加以
+区
+即
+即令
+即使
+即便
+即如
+即或
+即若
+却
+去
+又
+又及
+及
+及其
+及至
+反之
+反而
+反过来
+反过来说
+受到
+另
+另一方面
+另外
+另悉
+只
+只当
+只怕
+只是
+只有
+只消
+只要
+只限
+叫
+叮咚
+可
+可以
+可是
+可见
+各
+各个
+各位
+各种
+各自
+同
+同时
+后
+后者
+向
+向使
+向着
+吓
+吗
+否则
+吧
+吧哒
+含
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+呵
+呵呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咧
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+唯有
+啊
+啐
+啥
+啦
+啪达
+啷当
+喂
+喏
+喔唷
+喽
+嗡
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+嘿嘿
+四
+因
+因为
+因了
+因此
+因着
+因而
+固然
+在
+在下
+在于
+地
+基于
+处在
+多
+多么
+多少
+大
+大家
+她
+她们
+好
+如
+如上
+如上所述
+如下
+如何
+如其
+如同
+如是
+如果
+如此
+如若
+始而
+孰料
+孰知
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+对
+对于
+对待
+对方
+对比
+将
+小
+尔
+尔后
+尔尔
+尚且
+就
+就是
+就是了
+就是说
+就算
+就要
+尽
+尽管
+尽管如此
+岂但
+己
+已
+已矣
+巴
+巴巴
+年
+并
+并且
+庶乎
+庶几
+开外
+开始
+归
+归齐
+当
+当地
+当然
+当着
+彼
+彼时
+彼此
+往
+待
+很
+得
+得了
+怎
+怎么
+怎么办
+怎么样
+怎奈
+怎样
+总之
+总的来看
+总的来说
+总的说来
+总而言之
+恰恰相反
+您
+惟其
+慢说
+我
+我们
+或
+或则
+或是
+或曰
+或者
+截至
+所
+所以
+所在
+所幸
+所有
+才
+才能
+打
+打从
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+据此
+接着
+故
+故此
+故而
+旁人
+无
+无宁
+无论
+既
+既往
+既是
+既然
+日
+时
+时候
+是
+是以
+是的
+更
+曾
+替
+替代
+最
+月
+有
+有些
+有关
+有及
+有时
+有的
+望
+朝
+朝着
+本
+本人
+本地
+本着
+本身
+来
+来着
+来自
+来说
+极了
+果然
+果真
+某
+某个
+某些
+某某
+根据
+欤
+正值
+正如
+正巧
+正是
+此
+此地
+此处
+此外
+此时
+此次
+此间
+毋宁
+每
+每当
+比
+比及
+比如
+比方
+没奈何
+沿
+沿着
+漫说
+点
+焉
+然则
+然后
+然而
+照
+照着
+犹且
+犹自
+甚且
+甚么
+甚或
+甚而
+甚至
+甚至于
+用
+用来
+由
+由于
+由是
+由此
+由此可见
+的
+的确
+的话
+直到
+相对而言
+省得
+看
+眨眼
+着
+着呢
+矣
+矣乎
+矣哉
+离
+秒
+称
+竟而
+第
+等
+等到
+等等
+简言之
+管
+类如
+紧接着
+纵
+纵令
+纵使
+纵然
+经
+经过
+结果
+给
+继之
+继后
+继而
+综上所述
+罢了
+者
+而
+而且
+而况
+而后
+而外
+而已
+而是
+而言
+能
+能否
+腾
+自
+自个儿
+自从
+自各儿
+自后
+自家
+自己
+自打
+自身
+至
+至于
+至今
+至若
+致
+般的
+若
+若夫
+若是
+若果
+若非
+莫不然
+莫如
+莫若
+虽
+虽则
+虽然
+虽说
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+譬喻
+譬如
+让
+许多
+论
+设使
+设或
+设若
+诚如
+诚然
+该
+说
+说来
+请
+诸
+诸位
+诸如
+谁
+谁人
+谁料
+谁知
+贼死
+赖以
+赶
+起
+起见
+趁
+趁着
+越是
+距
+跟
+较
+较之
+边
+过
+还
+还是
+还有
+还要
+这
+这一来
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这次
+这般
+这边
+这里
+进而
+连
+连同
+逐步
+通过
+遵循
+遵照
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那般
+那边
+那里
+都
+鄙人
+鉴于
+针对
+阿
+除
+除了
+除外
+除开
+除此之外
+除非
+随
+随后
+随时
+随着
+难道说
+零
+非
+非但
+非徒
+非特
+非独
+靠
+顺
+顺着
+首先
+︿
+!
+#
+$
+%
+&
+(
+)
+*
++
+,
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+:
+;
+<
+>
+?
+@
+[
+]
+{
+|
+}
+~
+¥
\ No newline at end of file
--- /dev/null
+futhi
+kahle
+kakhulu
+kanye
+khona
+kodwa
+kungani
+kusho
+la
+lakhe
+lapho
+mina
+ngesikhathi
+nje
+phansi
+phezulu
+u
+ukuba
+ukuthi
+ukuze
+uma
+wahamba
+wakhe
+wami
+wase
+wathi
+yakhe
+zakhe
+zonke
\ No newline at end of file
--- /dev/null
+[wrap-file]
+directory = googletest-release-1.8.1
+
+source_url = https://github.com/google/googletest/archive/release-1.8.1.zip
+source_filename = gtest-1.8.1.zip
+source_hash = 927827c183d01734cc5cfef85e0ff3f5a92ffe6188e0d18e909c5efebf28a0c7
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/gtest/1.8.1/1/get_zip
+patch_filename = gtest-1.8.1-1-wrap.zip
+patch_hash = f79f5fd46e09507b3f2e09a51ea6eb20020effe543335f5aee59f30cc8d15805
--- /dev/null
+[wrap-file]
+directory = xz-5.2.1
+
+source_url = http://tukaani.org/xz/xz-5.2.1.tar.xz
+source_filename = xz-5.2.1.tar.xz
+source_hash = 6ecdd4d80b12001497df0741d6037f918d270fa0f9a1ab4e2664bf4157ae323c
+
+patch_url = https://mirror.download.kiwix.org/dev/xz-5.2.1-wrap.zip
+patch_filename = xz-5.2.1-wrap.zip
+patch_hash = 782a4e56bcc26ebda18041a05f2f85dce70284109a5ce99ea960c6b4432a99e9
+
+[provide]
+liblzma = lzma_dep
--- /dev/null
+[wrap-file]
+directory = zstd-1.4.5
+source_url = https://github.com/facebook/zstd/releases/download/v1.4.5/zstd-1.4.5.tar.gz
+source_filename = zstd-1.4.5.tar.gz
+source_hash = 98e91c7c6bf162bf90e4e70fdbc41a8188b9fa8de5ad840c401198014406ce9e
+
+patch_url = https://mirror.download.kiwix.org/dev/zstd-1.4.5-wrap.zip
+patch_filename = zstd-1.4.5-wrap.zip
+patch_hash = 4462693b58939b61ab76c5e5597343ab156eb0681b60a77908d2b88e17dca7cc
+
+[provide]
+libzstd = libzstd_dep
+
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/zim.h>
+#include <zim/archive.h>
+#include <zim/item.h>
+#include <zim/search.h>
+#include <zim/suggestion.h>
+#include <zim/error.h>
+
+#include <zim/writer/creator.h>
+
+#include "tools.h"
+#include "../src/fs.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using zim::unittests::makeTempFile;
+using zim::unittests::getDataFilePath;
+using zim::unittests::TempFile;
+using zim::unittests::TestItem;
+using zim::unittests::IsFrontArticle;
+
+// Ordered list of (name, value) string pairs describing the parameters of
+// one test iteration. The tests below stream it into gtest failure messages.
+using TestContextImpl = std::vector<std::pair<std::string, std::string>>;
+struct TestContext : TestContextImpl
+{
+  // Allows brace-initialisation: TestContext ctx{ {"name", "value"}, ... };
+  TestContext(const std::initializer_list<value_type>& entries)
+    : TestContextImpl(entries)
+  {
+  }
+};
+
+// Prints a TestContext as a "Test context:" header followed by one
+// tab-indented "name: value" line per entry, then a blank line (with flush).
+std::ostream& operator<<(std::ostream& out, const TestContext& ctx)
+{
+  out << "Test context:\n";
+  for ( auto entry = ctx.begin(); entry != ctx.end(); ++entry ) {
+    out << "\t" << entry->first << ": " << entry->second << "\n";
+  }
+  return out << std::endl;
+}
+
+// Returns the raw bytes of an empty ZIM archive: an 80-byte header, a
+// one-byte (empty) mimetype list and a trailing 16-byte md5 checksum.
+std::string
+emptyZimArchiveContent()
+{
+  // 8-byte field whose first byte is `firstByte` and whose other bytes are 0.
+  const auto field8 = [](char firstByte) {
+    return std::string(1, firstByte) + std::string(7, '\0');
+  };
+  const std::string zero4(4, '\0');
+
+  std::string bytes("ZIM\x04");  // Magic
+  bytes += '\x05';               // Version
+  bytes.append(3, '\0');
+  bytes.append(16, '\0');        // uuid
+  bytes += zero4;                // article count
+  bytes += zero4;                // cluster count
+  bytes += field8('\x51');       // url ptr pos
+  bytes += field8('\x51');       // title ptr pos
+  bytes += field8('\x51');       // cluster ptr pos
+  bytes += field8('\x50');       // mimelist ptr pos
+  bytes += zero4;                // main page index
+  bytes += zero4;                // layout page index
+  bytes += field8('\x51');       // checksum pos
+  bytes += '\0';                 // (empty) mimelist
+  bytes += "\x9f\x3e\xcd\x95\x46\xf6\xc5\x3b\x35\xb4\xc6\xd4\xc0\x8e\xd0\x66"; // md5sum
+  return bytes;
+}
+
+// A file made of an optional ZIM magic prefix followed by a run of identical
+// filler bytes is not a valid archive: opening it must throw
+// std::runtime_error for every prefix/byte/length combination tried here.
+TEST(ZimArchive, openingAnInvalidZimArchiveFails)
+{
+  const char* const magicPrefixes[] = { "ZIM\x04", "" };
+  const unsigned char fillerBytes[] = {0x00, 0x01, 0x11, 0x30, 0xFF};
+  for ( const std::string prefix : magicPrefixes ) {
+    const char* const hasPrefix = prefix.empty() ? "no" : "yes";
+    for ( const unsigned char filler : fillerBytes ) {
+      // Filler lengths 0, 10, ..., 90.
+      for ( int step = 0; step < 10; ++step ) {
+        const int count = 10 * step;
+        const TestContext ctx{
+          {"prefix", hasPrefix },
+          {"byte", zim::unittests::to_string(filler) },
+          {"count", zim::unittests::to_string(count) }
+        };
+        const std::string fileContent = prefix + std::string(count, filler);
+        const auto tmpfile = makeTempFile("invalid_zim_file", fileContent);
+
+        EXPECT_THROW( zim::Archive(tmpfile->path()), std::runtime_error ) << ctx;
+      }
+    }
+  }
+}
+
+// The bytes produced by emptyZimArchiveContent() must open without throwing
+// and pass the archive's own check().
+TEST(ZimArchive, openingAnEmptyZimArchiveSucceeds)
+{
+  const std::string content = emptyZimArchiveContent();
+  const auto zimFile = makeTempFile("empty_zim_file", content);
+
+  zim::Archive archive(zimFile->path());
+  ASSERT_TRUE(archive.check());
+}
+
+// Returns false for offsets inside [6, 24) (minor version or uuid) and
+// [64, 72) (page or layout index); returns true for every other offset.
+bool isNastyOffset(int offset) {
+  const bool inMinorVersionOrUuid = ( 6 <= offset && offset < 24 );
+  const bool inPageOrLayoutIndex = ( 64 <= offset && offset < 72 );
+  return !( inMinorVersionOrUuid || inPageOrLayoutIndex );
+}
+
+// For each of the first 80 bytes that isNastyOffset() flags, overwrite that
+// single byte of an otherwise valid empty archive with 0xff and expect
+// opening the result to throw std::runtime_error.
+TEST(ZimArchive, nastyEmptyZimArchive)
+{
+  const std::string validContent = emptyZimArchiveContent();
+  for ( int offset = 0; offset < 80; ++offset ) {
+    if ( !isNastyOffset(offset) )
+      continue;
+
+    const TestContext ctx{ {"offset", zim::unittests::to_string(offset) } };
+    std::string corrupted(validContent);
+    corrupted[offset] = '\xff';
+    const auto tmpfile = makeTempFile("wrong_checksum_empty_zim_file", corrupted);
+    EXPECT_THROW( zim::Archive(tmpfile->path()), std::runtime_error ) << ctx;
+  }
+}
+
+// Corrupting one byte of the trailing md5sum (bytes 81..96 of the empty
+// archive) must still let the archive open, but check() has to fail.
+TEST(ZimArchive, wrongChecksumInEmptyZimArchive)
+{
+  const std::size_t corruptedOffset = 85;
+  std::string corrupted = emptyZimArchiveContent();
+  corrupted[corruptedOffset] = '\xff';
+  const auto zimFile = makeTempFile("wrong_checksum_empty_zim_file", corrupted);
+
+  zim::Archive archive(zimFile->path());
+  ASSERT_FALSE(archive.check());
+}
+
+
+TEST(ZimArchive, openCreatedArchive)
+{
+ TempFile temp("zimfile");
+ auto tempPath = temp.path();
+ zim::Uuid uuid;
+  // Put special characters ('\n' and '\0') in the uuid to make sure they get no special treatment.
+ uuid.data[5] = '\n';
+ uuid.data[10] = '\0';
+
+ zim::writer::Creator creator;
+ creator.setUuid(uuid);
+ creator.configIndexing(true, "eng");
+ creator.startZimCreation(tempPath);
+ auto item = std::make_shared<TestItem>("foo", "text/html", "Foo", "FooContent", IsFrontArticle::YES);
+ creator.addItem(item);
+  // Make sure that the title order differs from the url order
+ item = std::make_shared<TestItem>("foo2", "text/html", "AFoo", "Foo2Content", IsFrontArticle::NO);
+ creator.addItem(item);
+ creator.addMetadata("Title", "This is a title");
+ creator.addIllustration(48, "PNGBinaryContent48");
+ creator.addIllustration(96, "PNGBinaryContent96");
+ creator.setMainPath("foo");
+ creator.addRedirection("foo3", "FooRedirection", "foo"); // No a front article.
+ creator.addRedirection("foo4", "FooRedirection", "NoExistant"); // Invalid redirection, must be removed by creator
+ creator.finishZimCreation();
+
+ zim::Archive archive(tempPath);
+#if !defined(ENABLE_XAPIAN)
+// 2*listingIndex + M/Counter + M/Title + mainpage + 2*Illustration + 2*Item + redirection
+#define ALL_ENTRY_COUNT 10
+#else
+// same as above + 2 xapian indexes.
+#define ALL_ENTRY_COUNT 12
+#endif
+ ASSERT_EQ(archive.getAllEntryCount(), ALL_ENTRY_COUNT);
+#undef ALL_ENTRY_COUNT
+ ASSERT_EQ(archive.getEntryCount(), 3);
+ ASSERT_EQ(archive.getArticleCount(), 1);
+ ASSERT_EQ(archive.getUuid(), uuid);
+ ASSERT_EQ(archive.getMetadataKeys(), std::vector<std::string>({"Counter", "Illustration_48x48@1", "Illustration_96x96@1", "Title"}));
+ ASSERT_EQ(archive.getIllustrationSizes(), std::set<unsigned int>({48, 96}));
+ ASSERT_TRUE(archive.hasMainEntry());
+
+ ASSERT_EQ(archive.getMetadata("Title"), "This is a title");
+ auto titleMeta = archive.getMetadataItem("Title");
+ ASSERT_EQ(std::string(titleMeta.getData()), "This is a title");
+ ASSERT_EQ(titleMeta.getMimetype(), "text/plain;charset=utf-8");
+ ASSERT_EQ(archive.getMetadata("Counter"), "text/html=2");
+ auto illu48 = archive.getIllustrationItem(48);
+ ASSERT_EQ(illu48.getPath(), "Illustration_48x48@1");
+ ASSERT_EQ(std::string(illu48.getData()), "PNGBinaryContent48");
+ auto illu48Meta = archive.getMetadataItem(illu48.getPath());
+ ASSERT_EQ(std::string(illu48Meta.getData()), "PNGBinaryContent48");
+ ASSERT_EQ(illu48Meta.getMimetype(), "image/png");
+ auto illu96 = archive.getIllustrationItem(96);
+ ASSERT_EQ(illu96.getPath(), "Illustration_96x96@1");
+ ASSERT_EQ(std::string(illu96.getData()), "PNGBinaryContent96");
+
+ auto foo = archive.getEntryByPath("foo");
+ ASSERT_EQ(foo.getPath(), "foo");
+ ASSERT_EQ(foo.getTitle(), "Foo");
+ ASSERT_EQ(std::string(foo.getItem().getData()), "FooContent");
+
+ auto foo2 = archive.getEntryByPath("foo2");
+ ASSERT_EQ(foo2.getPath(), "foo2");
+ ASSERT_EQ(foo2.getTitle(), "AFoo");
+ ASSERT_EQ(std::string(foo2.getItem().getData()), "Foo2Content");
+
+ auto foo3 = archive.getEntryByPath("foo3");
+ ASSERT_EQ(foo3.getPath(), "foo3");
+ ASSERT_EQ(foo3.getTitle(), "FooRedirection");
+ ASSERT_TRUE(foo3.isRedirect());
+ ASSERT_EQ(foo3.getRedirectEntry().getIndex(), foo.getIndex());
+
+ auto main = archive.getMainEntry();
+ ASSERT_TRUE(main.isRedirect());
+ ASSERT_EQ(main.getRedirectEntry().getIndex(), foo.getIndex());
+ ASSERT_EQ(archive.getMainEntryIndex(), main.getIndex());
+}
+
+#if WITH_TEST_DATA
+TEST(ZimArchive, openRealZimArchive)
+{
+  // Every listed test archive must open without throwing and pass check().
+  const char* const zimfiles[] = {
+    "small.zim",
+    "wikibooks_be_all_nopic_2017-02.zim",
+    "wikibooks_be_all_nopic_2017-02_splitted.zim",
+    "wikipedia_en_climate_change_nopic_2020-01.zim"
+  };
+
+  for ( const std::string zimName : zimfiles ) {
+    for (auto& dataFile: getDataFilePath(zimName)) {
+      const TestContext ctx{ {"path", dataFile.path } };
+
+      std::unique_ptr<zim::Archive> opened;
+      EXPECT_NO_THROW( opened.reset(new zim::Archive(dataFile.path)) ) << ctx;
+      if ( !opened ) {
+        // Opening failed (already reported above): nothing left to check.
+        continue;
+      }
+      EXPECT_TRUE( opened->check() ) << ctx;
+    }
+  }
+}
+
+TEST(ZimArchive, randomEntry)
+{
+  // A random entry, once redirections are resolved, must be an html item.
+  const char* const zimfiles[] = {
+    "wikibooks_be_all_nopic_2017-02.zim",
+    "wikibooks_be_all_nopic_2017-02_splitted.zim",
+    "wikipedia_en_climate_change_nopic_2020-01.zim"
+  };
+
+  for ( const std::string zimName : zimfiles ) {
+    for (auto& dataFile: getDataFilePath(zimName)) {
+      const TestContext ctx{ {"path", dataFile.path } };
+      const zim::Archive archive(dataFile.path);
+      try {
+        // getItem(true) follows redirections.
+        const auto item = archive.getRandomEntry().getItem(true);
+        ASSERT_TRUE(item.getMimetype().find("text/html") != std::string::npos) << ctx;
+      } catch (const zim::EntryNotFound&) {
+        FAIL() << "Impossible to find a random Entry in " << zimName << ".\n"
+               << "This may occur even if this is not a bug (random will be random).\n"
+               << "Please re-run the tests.";
+      }
+    }
+  }
+}
+
+// Checks that each listed archive exposes exactly one illustration, of size
+// 48, and that its path matches the layout of the archive category ("nons"
+// archives store it as metadata, the others under I/favicon.png).
+TEST(ZimArchive, illustration)
+{
+  const char* const zimfiles[] = {
+    "small.zim",
+    "wikibooks_be_all_nopic_2017-02.zim"
+  };
+
+  for ( const std::string fname : zimfiles ) {
+    for (auto& testfile: getDataFilePath(fname)) {
+      const TestContext ctx{ {"path", testfile.path } };
+      const zim::Archive archive(testfile.path);
+      ASSERT_TRUE(archive.hasIllustration(48)) << ctx;
+      auto illustrationItem = archive.getIllustrationItem(48);
+      if(testfile.category == "nons") {
+        ASSERT_EQ(illustrationItem.getPath(), "Illustration_48x48@1") << ctx;
+      } else {
+        ASSERT_EQ(illustrationItem.getPath(), "I/favicon.png") << ctx;
+      }
+      // Report the file path on failure, like every other assertion of the loop.
+      ASSERT_EQ(archive.getIllustrationSizes(), std::set<unsigned int>({48})) << ctx;
+    }
+  }
+}
+
+// Expected counters of one test archive. Kept as a plain aggregate: the
+// member order is the order used by brace initialisation.
+struct ZimFileInfo {
+  zim::entry_index_type articleCount;
+  zim::entry_index_type entryCount;
+  zim::entry_index_type allEntryCount;
+};
+
+// Expected counters of a test file for both archive categories.
+struct TestDataInfo {
+  const char* const name;
+  ZimFileInfo withnsInfo, nonsInfo;
+
+  // Returns the expectations matching `category` ("nons" or "withns").
+  // Throws std::runtime_error for any other category.
+  const ZimFileInfo& getZimFileInfo(const std::string& category) const {
+    if (category == "nons") {
+      return nonsInfo;
+    }
+    if (category == "withns") {
+      return withnsInfo;
+    }
+    throw std::runtime_error("Unknown category");
+  }
+};
+
+TEST(ZimArchive, articleNumber)
+{
+  // "withns" zim files have no notion of user entries, so entryCount equals
+  // allEntryCount for them. small.zim always has 1 article, whether the
+  // article is in the 'A' namespace or in the specific index.
+  const TestDataInfo expectations[] = {
+    // Name                                             withns                               nons
+    //                                                  {articles, userEntries, allEntries}, {articles, userEntries, allEntries}
+    {"small.zim",                                       { 1,    17,   17  },                 { 1,    2,    16   }},
+    {"wikibooks_be_all_nopic_2017-02.zim",              { 70,   118,  118 },                 { 66,   109,  123  }},
+    {"wikibooks_be_all_nopic_2017-02_splitted.zim",     { 70,   118,  118 },                 { 66,   109,  123  }},
+    {"wikipedia_en_climate_change_nopic_2020-01.zim",   { 7253, 7646, 7646},                 { 1837, 7633, 7649 }}
+  };
+
+  for ( const auto& expectation : expectations ) {
+    for (auto& dataFile: getDataFilePath(expectation.name)) {
+      const TestContext ctx{ {"path", dataFile.path } };
+      const auto& expected = expectation.getZimFileInfo(dataFile.category);
+      const zim::Archive archive(dataFile.path);
+
+      EXPECT_EQ( archive.getAllEntryCount(), expected.allEntryCount ) << ctx;
+      EXPECT_EQ( archive.getEntryCount(), expected.entryCount ) << ctx;
+      EXPECT_EQ( archive.getArticleCount(), expected.articleCount ) << ctx;
+    }
+  }
+}
+#endif
+
+// While an instance is alive, everything written to std::cerr is collected in
+// an internal string stream; the destructor reinstalls the stream buffer that
+// std::cerr had before.
+class CapturedStderr
+{
+  std::ostringstream captured_;        // receives what is written to std::cerr
+  std::streambuf* const previous_;     // buffer std::cerr used before the capture
+
+public:
+  // rdbuf(newBuffer) installs the new buffer and returns the previous one.
+  // captured_ is declared first, so it is already constructed at this point.
+  CapturedStderr()
+    : previous_(std::cerr.rdbuf(captured_.rdbuf()))
+  {}
+
+  CapturedStderr(const CapturedStderr&) = delete;
+
+  ~CapturedStderr() { std::cerr.rdbuf(previous_); }
+
+  // Text captured so far.
+  operator std::string() const { return captured_.str(); }
+};
+
+// Runs zim::validate() on ZIMPATH while std::cerr is captured, and expects the
+// validation to fail with exactly EXPECTED_STDERROR_TEXT written to std::cerr.
+// The expansion is several statements, not one: it declares a local variable
+// named `stderror` and reads a variable named `checksToRun` from the enclosing
+// scope, so it can appear only once per scope. ZIMPATH is evaluated twice.
+#define EXPECT_BROKEN_ZIMFILE(ZIMPATH, EXPECTED_STDERROR_TEXT) \
+ CapturedStderr stderror; \
+ EXPECT_FALSE(zim::validate(ZIMPATH, checksToRun)); \
+ EXPECT_EQ(EXPECTED_STDERROR_TEXT, std::string(stderror)) << ZIMPATH;
+
+// Applies EXPECT_BROKEN_ZIMFILE to every file returned by
+// getDataFilePath(ZIMNAME), expecting the same text EXPECTED for each of them.
+#define TEST_BROKEN_ZIM_NAME(ZIMNAME, EXPECTED) \
+for(auto& testfile: getDataFilePath(ZIMNAME)) {EXPECT_BROKEN_ZIMFILE(testfile.path, EXPECTED)}
+
+#if WITH_TEST_DATA
+// Checks zim::validate(): a sane file passes every integrity check, and each
+// deliberately broken file is rejected with a specific message on std::cerr.
+TEST(ZimArchive, validate)
+{
+ zim::IntegrityCheckList all;
+ all.set();
+
+ for(auto& testfile: getDataFilePath("small.zim")) {
+ ASSERT_TRUE(zim::validate(testfile.path, all));
+ }
+
+ // The broken files are validated with every check except CHECKSUM.
+ // `checksToRun` is the name the EXPECT_BROKEN_ZIMFILE macro looks up.
+ zim::IntegrityCheckList checksToRun;
+ checksToRun.set();
+ checksToRun.reset(size_t(zim::IntegrityCheck::CHECKSUM));
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.smaller_than_header.zim",
+ "zim-file is too small to contain a header\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_urlptrpos.zim",
+ "Dirent pointer table outside (or not fully inside) ZIM file.\n"
+ );
+
+ // The expected message differs between the "withns" and the other category.
+ for(auto& testfile: getDataFilePath("invalid.outofbounds_titleptrpos.zim")) {
+ std::string expected;
+ if (testfile.category == "withns") {
+ expected = "Title index table outside (or not fully inside) ZIM file.\n";
+ } else {
+ expected = "Full Title index table outside (or not fully inside) ZIM file.\n";
+ }
+ EXPECT_BROKEN_ZIMFILE(testfile.path, expected)
+ }
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_clusterptrpos.zim",
+ "Cluster pointer table outside (or not fully inside) ZIM file.\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.invalid_mimelistpos.zim",
+ "mimelistPos must be 80.\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.invalid_checksumpos.zim",
+ "Checksum position is not valid\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_first_direntptr.zim",
+ "Invalid dirent pointer\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_last_direntptr.zim",
+ "Invalid dirent pointer\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_first_title_entry.zim",
+ "Invalid title index entry.\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_last_title_entry.zim",
+ "Invalid title index entry.\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.outofbounds_first_clusterptr.zim",
+ "Invalid cluster pointer\n"
+ );
+
+
+ // The reported paths depend on the category of the archive.
+ for(auto& testfile: getDataFilePath("invalid.nonsorted_dirent_table.zim")) {
+ std::string expected;
+ if (testfile.category == "withns") {
+ expected = "Dirent table is not properly sorted:\n"
+ " #0: A/main.html\n"
+ " #1: -/favicon\n";
+ } else {
+ expected = "Dirent table is not properly sorted:\n"
+ " #0: C/main.html\n"
+ " #1: C/favicon.png\n";
+ }
+ EXPECT_BROKEN_ZIMFILE(testfile.path, expected)
+ }
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.nonsorted_title_index.zim",
+ "Title index is not properly sorted.\n"
+ );
+
+ TEST_BROKEN_ZIM_NAME(
+ "invalid.bad_mimetype_list.zim",
+ "Error getting mimelists.\n"
+ );
+
+ // The entry named in the message depends on the category of the archive.
+ for(auto& testfile: getDataFilePath("invalid.bad_mimetype_in_dirent.zim")) {
+ std::string expected;
+ if (testfile.category == "withns") {
+ expected = "Entry M/Language has invalid MIME-type value 1234.\n";
+ } else {
+ expected = "Entry M/Publisher has invalid MIME-type value 1234.\n";
+ }
+ EXPECT_BROKEN_ZIMFILE(testfile.path, expected)
+ }
+}
+#endif
+
+// Asserts that two archives expose the same content: file size, cluster and
+// entry counts, main entry title, then entry by entry in the "efficient",
+// by-path and by-title iteration orders. When xapian is enabled and archive1
+// has a title index, the suggestions for the main item's title are compared too.
+//
+// The ASSERT_* macros only return from this helper; a fatal failure here does
+// not by itself stop the calling test.
+void checkEquivalence(const zim::Archive& archive1, const zim::Archive& archive2)
+{
+ EXPECT_EQ(archive1.getFilesize(), archive2.getFilesize());
+ EXPECT_EQ(archive1.getClusterCount(), archive2.getClusterCount());
+
+ ASSERT_EQ(archive1.getEntryCount(), archive2.getEntryCount());
+ const zim::Entry mainEntry = archive1.getMainEntry();
+ ASSERT_EQ(mainEntry.getTitle(), archive2.getMainEntry().getTitle());
+
+ ASSERT_NE(0, archive1.getEntryCount()); // ==> below loop is not a noop
+ // NOTE(review): each paired loop below stops as soon as either range is
+ // exhausted, so a length mismatch between the two ranges would go unnoticed
+ // by the loop itself; the entry-count assertion above presumably covers it.
+ {
+ auto range1 = archive1.iterEfficient();
+ auto range2 = archive2.iterEfficient();
+ for ( auto it1=range1.begin(), it2=range2.begin(); it1!=range1.end() && it2!=range2.end(); ++it1, ++it2 ) {
+ auto& entry1 = *it1;
+ auto& entry2 = *it2;
+ ASSERT_EQ(entry1.getIndex(), entry2.getIndex());
+ ASSERT_EQ(entry1.getPath(), entry2.getPath());
+ ASSERT_EQ(entry1.getTitle(), entry2.getTitle());
+ ASSERT_EQ(entry1.isRedirect(), entry2.isRedirect());
+ // Only non-redirect entries have an item whose content can be compared.
+ if (!entry1.isRedirect()) {
+ auto item1 = entry1.getItem();
+ auto item2 = entry2.getItem();
+ ASSERT_EQ(item1.getMimetype(), item2.getMimetype());
+ ASSERT_EQ(item1.getSize(), item2.getSize());
+ ASSERT_EQ(item1.getData(), item2.getData());
+ }
+ }
+ }
+
+ // Same entries in the same order when iterating by path.
+ {
+ auto range1 = archive1.iterByPath();
+ auto range2 = archive2.iterByPath();
+ for ( auto it1=range1.begin(), it2=range2.begin(); it1!=range1.end() && it2!=range2.end(); ++it1, ++it2 ) {
+ auto& entry1 = *it1;
+ auto& entry2 = *it2;
+
+ ASSERT_EQ(entry1.getIndex(), entry2.getIndex());
+ }
+ }
+
+ // Same entries in the same order when iterating by title.
+ {
+ auto range1 = archive1.iterByTitle();
+ auto range2 = archive2.iterByTitle();
+ for ( auto it1=range1.begin(), it2=range2.begin(); it1!=range1.end() && it2!=range2.end(); ++it1, ++it2 ) {
+ auto& entry1 = *it1;
+ auto& entry2 = *it2;
+
+ ASSERT_EQ(entry1.getIndex(), entry2.getIndex());
+ }
+ }
+
+#if defined(ENABLE_XAPIAN)
+ if ( archive1.hasTitleIndex() )
+ {
+ // Resolve any potential redirect.
+ auto mainItem = mainEntry.getItem(true);
+ zim::SuggestionSearcher searcher1(archive1);
+ zim::SuggestionSearcher searcher2(archive2);
+ std::string query = mainItem.getTitle();
+ auto search1 = searcher1.suggest(query);
+ auto search2 = searcher2.suggest(query);
+ ASSERT_NE(0, search1.getEstimatedMatches());
+ ASSERT_EQ(search1.getEstimatedMatches(), search2.getEstimatedMatches());
+
+ // The first suggestion of both archives must resolve to the main item.
+ auto result1 = search1.getResults(0, archive1.getEntryCount());
+ auto result2 = search2.getResults(0, archive2.getEntryCount());
+ auto firstSearchItem1 = result1.begin().getEntry().getItem(true);
+ auto firstSearchItem2 = result2.begin().getEntry().getItem(true);
+ ASSERT_EQ(mainItem.getPath(), firstSearchItem1.getPath());
+ ASSERT_EQ(mainItem.getPath(), firstSearchItem2.getPath());
+ ASSERT_EQ(result1.size(), result2.size());
+ }
+#endif
+}
+
+#if WITH_TEST_DATA
+TEST(ZimArchive, multipart)
+{
+  // The single-file and the split version of the same archive must be
+  // equivalent; only the split one reports itself as multi-part.
+  const auto singleFileZims = getDataFilePath("wikibooks_be_all_nopic_2017-02.zim");
+  const auto splitZims = getDataFilePath("wikibooks_be_all_nopic_2017-02_splitted.zim");
+
+  ASSERT_EQ(singleFileZims.size(), splitZims.size()) << "We must have same number of zim files. (This is a test data issue)";
+
+  for (size_t idx = 0; idx < singleFileZims.size(); ++idx) {
+    const zim::Archive singleArchive(singleFileZims[idx].path);
+    const zim::Archive splitArchive(splitZims[idx].path);
+    ASSERT_FALSE(singleArchive.isMultiPart());
+    ASSERT_TRUE (splitArchive.isMultiPart());
+
+    checkEquivalence(singleArchive, splitArchive);
+  }
+}
+
+#ifdef _WIN32
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#undef min
+#undef max
+# define OPEN_READ_ONLY(path) _open((path).c_str(), _O_RDONLY)
+#else
+# define OPEN_READ_ONLY(path) open((path).c_str(), O_RDONLY)
+#endif
+
+#ifndef _WIN32
+TEST(ZimArchive, openByFD)
+{
+  // An archive opened from a file descriptor must be equivalent to the same
+  // archive opened from its path.
+  for(auto& dataFile: getDataFilePath("small.zim")) {
+    const zim::Archive openedByPath(dataFile.path);
+
+    const int descriptor = OPEN_READ_ONLY(dataFile.path);
+    const zim::Archive openedByFd(descriptor);
+
+    checkEquivalence(openedByPath, openedByFd);
+  }
+}
+
+TEST(ZimArchive, openZIMFileEmbeddedInAnotherFile)
+{
+  const auto plainZims = getDataFilePath("small.zim");
+  const auto containerFiles = getDataFilePath("small.zim.embedded");
+
+  ASSERT_EQ(plainZims.size(), containerFiles.size()) << "We must have same number of zim files. (This is a test data issue)";
+
+  for (size_t idx = 0; idx < plainZims.size(); ++idx) {
+    const zim::Archive plainArchive(plainZims[idx].path);
+
+    // The embedded archive is opened at offset 8 of the container, with the
+    // size of the plain archive.
+    const int descriptor = OPEN_READ_ONLY(containerFiles[idx].path);
+    const zim::Archive embeddedArchive(descriptor, 8, plainArchive.getFilesize());
+
+    checkEquivalence(plainArchive, embeddedArchive);
+  }
+}
+#endif // not _WIN32
+#endif // WITH_TEST_DATA
+
+// Reads `size` bytes directly from the file named by `dai.first`, starting at
+// offset `dai.second`, and returns them as a blob owning its own storage.
+zim::Blob readItemData(const zim::Item::DirectAccessInfo& dai, zim::size_type size)
+{
+  zim::DEFAULTFS::FD fd(zim::DEFAULTFS::openFile(dai.first));
+  // The storage is allocated with new[], so it must be released with
+  // delete[]: the default deleter of shared_ptr<char> would call plain
+  // delete, which is undefined behaviour for an array.
+  std::shared_ptr<char> data(new char[size], std::default_delete<char[]>());
+  fd.readAt(data.get(), zim::zsize_t(size), zim::offset_t(dai.second));
+  return zim::Blob(data, size);
+}
+
+#if WITH_TEST_DATA
+TEST(ZimArchive, getDirectAccessInformation)
+{
+  // For every item that reports direct access information, reading the file
+  // at the reported location must give the same bytes as the item itself.
+  for(auto& dataFile: getDataFilePath("small.zim")) {
+    const zim::Archive archive(dataFile.path);
+    zim::entry_index_type verifiedItems = 0;
+
+    for ( auto entry : archive.iterEfficient() ) {
+      if (entry.isRedirect()) {
+        continue;
+      }
+      const TestContext ctx{ {"entry", entry.getPath() } };
+      const auto item = entry.getItem();
+      const auto accessInfo = item.getDirectAccessInformation();
+      if ( accessInfo.first == "" ) {
+        // No file name reported for this item: nothing to compare.
+        continue;
+      }
+      ++verifiedItems;
+      EXPECT_EQ(item.getData(), readItemData(accessInfo, item.getSize())) << ctx;
+    }
+
+    // At least one item must have been compared.
+    ASSERT_NE(0, verifiedItems);
+  }
+}
+
+#ifndef _WIN32
+TEST(ZimArchive, getDirectAccessInformationInAnArchiveOpenedByFD)
+{
+  // Same verification as for a path-opened archive, but the archive is
+  // opened from a file descriptor.
+  for(auto& dataFile: getDataFilePath("small.zim")) {
+    const int descriptor = OPEN_READ_ONLY(dataFile.path);
+    const zim::Archive archive(descriptor);
+    zim::entry_index_type verifiedItems = 0;
+
+    for ( auto entry : archive.iterEfficient() ) {
+      if (entry.isRedirect()) {
+        continue;
+      }
+      const TestContext ctx{ {"entry", entry.getPath() } };
+      const auto item = entry.getItem();
+      const auto accessInfo = item.getDirectAccessInformation();
+      if ( accessInfo.first == "" ) {
+        continue;
+      }
+      ++verifiedItems;
+      EXPECT_EQ(item.getData(), readItemData(accessInfo, item.getSize())) << ctx;
+    }
+
+    // At least one item must have been compared.
+    ASSERT_NE(0, verifiedItems);
+  }
+}
+
+TEST(ZimArchive, getDirectAccessInformationFromEmbeddedArchive)
+{
+  const auto plainZims = getDataFilePath("small.zim");
+  const auto containerFiles = getDataFilePath("small.zim.embedded");
+
+  ASSERT_EQ(plainZims.size(), containerFiles.size()) << "We must have same number of zim files. (This is a test data issue)";
+
+  for (size_t idx = 0; idx < plainZims.size(); ++idx) {
+    // The embedded archive is opened at offset 8 of the container, with the
+    // size of the corresponding plain zim file.
+    const int descriptor = OPEN_READ_ONLY(containerFiles[idx].path);
+    const auto plainSize = zim::DEFAULTFS::openFile(plainZims[idx].path).getSize();
+    const zim::Archive archive(descriptor, 8, plainSize.v);
+    zim::entry_index_type verifiedItems = 0;
+
+    for ( auto entry : archive.iterEfficient() ) {
+      if (entry.isRedirect()) {
+        continue;
+      }
+      const TestContext ctx{ {"entry", entry.getPath() } };
+      const auto item = entry.getItem();
+      const auto accessInfo = item.getDirectAccessInformation();
+      if ( accessInfo.first == "" ) {
+        continue;
+      }
+      ++verifiedItems;
+      EXPECT_EQ(item.getData(), readItemData(accessInfo, item.getSize())) << ctx;
+    }
+
+    // At least one item must have been compared.
+    ASSERT_NE(0, verifiedItems);
+  }
+}
+#endif // not _WIN32
+#endif // WITH_TEST_DATA
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "buffer.h"
+#include "bufferstreamer.h"
+#include "endian_tools.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using namespace zim;
+
+////////////////////////////////////////////////////////////////////////////////
+// BufferStreamer
+////////////////////////////////////////////////////////////////////////////////
+
+TEST(BufferStreamer, shouldJustWork)
+{
+  // Alphabet plus terminating NUL; the first 4 bytes and the 8 bytes starting
+  // at offset 18 are overwritten with little-endian integers.
+  char data[] = "abcdefghijklmnopqrstuvwxyz";
+  zim::toLittleEndian(uint32_t(1234), data);
+  zim::toLittleEndian(int64_t(-987654321), data+18);
+
+  const zsize_t dataSize(sizeof(data));
+  auto buffer = Buffer::makeBuffer(data, dataSize);
+  zim::BufferStreamer streamer(buffer, dataSize);
+
+  // Reading the 32-bit value moves the cursor 4 bytes forward.
+  ASSERT_EQ(1234, streamer.read<uint32_t>());
+  ASSERT_EQ(data + 4, streamer.current());
+
+  // Copy 4 raw bytes at the cursor, then skip over them.
+  const std::string firstChunk(streamer.current(), 4);
+  streamer.skip(zsize_t(4));
+  ASSERT_EQ("efgh", firstChunk);
+  ASSERT_EQ(data + 8, streamer.current());
+
+  // Copy 10 raw bytes at the cursor, then skip over them.
+  const std::string secondChunk(streamer.current(), 10);
+  streamer.skip(zsize_t(10));
+  ASSERT_EQ("ijklmnopqr", secondChunk);
+
+  // The cursor is now at offset 18, where the 64-bit value was stored.
+  ASSERT_EQ(-987654321, streamer.read<int64_t>());
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#if defined(_MSC_VER)
+# include <BaseTsd.h>
+ typedef SSIZE_T ssize_t;
+#else
+# include <unistd.h>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#undef min
+#undef max
+#endif
+
+#include "gtest/gtest.h"
+
+#include <zim/zim.h>
+#include <zim/writer/contentProvider.h>
+
+#include "../src/buffer.h"
+#include "../src/cluster.h"
+#include "../src/file_part.h"
+#include "../src/file_compound.h"
+#include "../src/buffer_reader.h"
+#include "../src/writer/cluster.h"
+#include "../src/endian_tools.h"
+#include "../src/config.h"
+
+#include "tools.h"
+
+namespace
+{
+
+using zim::unittests::TempFile;
+using zim::unittests::write_to_buffer;
+
+TEST(ClusterTest, create_cluster)
+{
+  // A fresh uncompressed writer cluster is empty; after three additions it
+  // reports three blobs with the sizes of the added strings.
+  zim::writer::Cluster writerCluster(zim::Compression::None);
+  ASSERT_EQ(writerCluster.count().v, 0U);
+
+  std::string digits("123456789012345678901234567890");
+  std::string upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string lower("abcdefghijklmnopqrstuvwxyz");
+
+  writerCluster.addContent(digits);
+  writerCluster.addContent(upper);
+  writerCluster.addContent(lower);
+
+  ASSERT_EQ(writerCluster.count().v, 3U);
+  ASSERT_EQ(writerCluster.getBlobSize(zim::blob_index_t(0)).v, digits.size());
+  ASSERT_EQ(writerCluster.getBlobSize(zim::blob_index_t(1)).v, upper.size());
+  ASSERT_EQ(writerCluster.getBlobSize(zim::blob_index_t(2)).v, lower.size());
+}
+
+TEST(ClusterTest, read_write_cluster)
+{
+  // Write an uncompressed cluster to memory and read it back.
+  zim::writer::Cluster writerCluster(zim::Compression::None);
+
+  std::string digits("123456789012345678901234567890");
+  std::string upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string lowerWithGap("abcdefghijklmnop vwxyz");
+
+  writerCluster.addContent(digits);
+  writerCluster.addContent(upper);
+  writerCluster.addContent(lowerWithGap);
+  writerCluster.close();
+
+  auto serialized = write_to_buffer(writerCluster);
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::None);
+  ASSERT_EQ(readCluster.isExtended, false);
+  ASSERT_EQ(readCluster.count().v, 3U);
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(0)).v, digits.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(1)).v, upper.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(2)).v, lowerWithGap.size());
+}
+
+TEST(ClusterTest, read_write_no_content)
+{
+  // A cluster closed without any content must read back as an empty,
+  // non-extended, uncompressed cluster. The extra "\3garbage" argument is
+  // forwarded to write_to_buffer() unchanged.
+  zim::writer::Cluster writerCluster(zim::Compression::None);
+  writerCluster.close();
+
+  auto serialized = write_to_buffer(writerCluster, "\3garbage");
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::None);
+  ASSERT_EQ(readCluster.isExtended, false);
+  ASSERT_EQ(readCluster.count().v, 0U);
+}
+
+TEST(ClusterTest, read_write_empty)
+{
+  // Three empty blobs must survive a write/read round trip as three blobs
+  // of size zero.
+  zim::writer::Cluster writerCluster(zim::Compression::None);
+
+  std::string nothing;
+  writerCluster.addContent(nothing);
+  writerCluster.addContent(nothing);
+  writerCluster.addContent(nothing);
+  writerCluster.close();
+
+  auto serialized = write_to_buffer(writerCluster);
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::None);
+  ASSERT_EQ(readCluster.isExtended, false);
+  ASSERT_EQ(readCluster.count().v, 3U);
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(0)).v, 0U);
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(1)).v, 0U);
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(2)).v, 0U);
+}
+
+TEST(ClusterTest, read_write_clusterLzma)
+{
+  // Round trip of an Lzma cluster: sizes and contents must be preserved.
+  zim::writer::Cluster writerCluster(zim::Compression::Lzma);
+
+  std::string digits("123456789012345678901234567890");
+  std::string upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string lower("abcdefghijklmnopqrstuvwxyz");
+
+  writerCluster.addContent(digits);
+  writerCluster.addContent(upper);
+  writerCluster.addContent(lower);
+  writerCluster.close();
+
+  auto serialized = write_to_buffer(writerCluster);
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.isExtended, false);
+  ASSERT_EQ(readCluster.count().v, 3U);
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::Lzma);
+
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(0)).v, digits.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(1)).v, upper.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(2)).v, lower.size());
+
+  ASSERT_EQ(digits, std::string(readCluster.getBlob(zim::blob_index_t(0))));
+  ASSERT_EQ(upper, std::string(readCluster.getBlob(zim::blob_index_t(1))));
+  ASSERT_EQ(lower, std::string(readCluster.getBlob(zim::blob_index_t(2))));
+}
+
+TEST(ClusterTest, read_write_clusterZstd)
+{
+  // Round trip of a Zstd cluster: sizes and contents must be preserved.
+  zim::writer::Cluster writerCluster(zim::Compression::Zstd);
+
+  std::string digits("123456789012345678901234567890");
+  std::string upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string lower("abcdefghijklmnopqrstuvwxyz");
+
+  writerCluster.addContent(digits);
+  writerCluster.addContent(upper);
+  writerCluster.addContent(lower);
+  writerCluster.close();
+
+  auto serialized = write_to_buffer(writerCluster);
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.isExtended, false);
+  ASSERT_EQ(readCluster.count().v, 3U);
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::Zstd);
+
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(0)).v, digits.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(1)).v, upper.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(2)).v, lower.size());
+
+  ASSERT_EQ(digits, std::string(readCluster.getBlob(zim::blob_index_t(0))));
+  ASSERT_EQ(upper, std::string(readCluster.getBlob(zim::blob_index_t(1))));
+  ASSERT_EQ(lower, std::string(readCluster.getBlob(zim::blob_index_t(2))));
+}
+
+// Content provider that serves `size` zero bytes in chunks of at most 1 MiB,
+// all taken from the same zero-filled buffer.
+class FakeProvider : public zim::writer::ContentProvider
+{
+  public:
+    FakeProvider(zim::size_type size)
+      : size(size),
+        offset(0),
+        buffer(new char[1024*1024U]())   // "()" value-initialises, i.e. zero-fills
+    {}
+
+    zim::size_type getSize() const { return size; }
+
+    // Returns the next chunk: 1 MiB, or whatever is left if that is smaller.
+    zim::Blob feed() {
+      const auto chunkSize = std::min(zim::size_type(1024*1024), size-offset);
+      offset += chunkSize;
+      return zim::Blob(buffer.get(), chunkSize);
+    }
+
+  private:
+    zim::size_type size;              // total number of bytes to serve
+    zim::offset_type offset;          // number of bytes served so far
+    std::unique_ptr<char[]> buffer;   // shared zero-filled chunk
+};
+
+TEST(ClusterTest, read_write_extended_cluster)
+{
+  // zim::writer doesn't support 32-bit architectures.
+  if (SIZE_MAX == UINT32_MAX) {
+    return;
+  }
+
+  // The test can be disabled by setting SKIP_BIG_MEMORY_TEST=1.
+  const char* const skipFlag = std::getenv("SKIP_BIG_MEMORY_TEST");
+  if (skipFlag != nullptr && std::string(skipFlag) == "1") {
+    std::cout << "Skip big memory test" << std::endl;
+    return;
+  }
+
+  // Three small blobs, one blob of almost 4 GiB, then a last small blob: the
+  // total is above 4 GiB, which must make the cluster extended.
+  std::string digits("123456789012345678901234567890");
+  std::string upper("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string lower("abcdefghijklmnopqrstuvwxyz");
+  std::string reversed("zyxwvutsrqponmlkjihgfedcba");
+
+  const uint64_t FOUR_GIB = 4LL * 1024LL*1024LL*1024LL;
+  zim::size_type almost_4g = FOUR_GIB - 16;
+  std::unique_ptr<zim::writer::ContentProvider> hugeProvider(new FakeProvider(almost_4g));
+
+  zim::writer::Cluster writerCluster(zim::Compression::None);
+  writerCluster.addContent(digits);
+  writerCluster.addContent(upper);
+  writerCluster.addContent(lower);
+  writerCluster.addContent(std::move(hugeProvider));
+  writerCluster.addContent(reversed);
+
+  ASSERT_GT(writerCluster.size().v, FOUR_GIB);
+  ASSERT_EQ(writerCluster.is_extended(), true);
+
+  auto serialized = write_to_buffer(writerCluster);
+
+  const auto readBack = zim::Cluster::read(zim::BufferReader(serialized), zim::offset_t(0));
+  zim::Cluster& readCluster = *readBack;
+
+  ASSERT_EQ(readCluster.isExtended, true);
+  ASSERT_EQ(readCluster.count().v, 5U);
+  ASSERT_EQ(readCluster.getCompression(), zim::Compression::None);
+
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(0)).v, digits.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(1)).v, upper.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(2)).v, lower.size());
+  ASSERT_EQ(readCluster.getBlobSize(zim::blob_index_t(3)).v, almost_4g);
+
+  // Only the small blobs are compared byte for byte.
+  ASSERT_EQ(digits, std::string(readCluster.getBlob(zim::blob_index_t(0))));
+  ASSERT_EQ(upper, std::string(readCluster.getBlob(zim::blob_index_t(1))));
+  ASSERT_EQ(lower, std::string(readCluster.getBlob(zim::blob_index_t(2))));
+  ASSERT_EQ(reversed, std::string(readCluster.getBlob(zim::blob_index_t(4))));
+}
+
+
+// Hand-writes an extended (64-bit offsets), uncompressed cluster into a
+// temporary file and reads it back through a MultiPartFileReader. The file
+// holds one info byte (0x11), five 64-bit little-endian offsets, three small
+// blobs, and a fourth blob bigger than 4 GiB that is produced by seeking
+// forward and writing a single final byte.
+//
+// Every write() and the seek are asserted, so that an I/O failure while
+// building the file is reported where it happens instead of surfacing later
+// as a confusing read failure.
+TEST(ClusterTest, read_extended_cluster)
+{
+  // The test can be disabled by setting SKIP_BIG_MEMORY_TEST=1.
+  char* SKIP_BIG_MEMORY_TEST = std::getenv("SKIP_BIG_MEMORY_TEST");
+  if (SKIP_BIG_MEMORY_TEST != nullptr && std::string(SKIP_BIG_MEMORY_TEST) == "1") {
+    std::cout << "Skip big memory test" << std::endl;
+    return;
+  }
+
+  TempFile tmpfile("extended_cluster");
+  int fd = tmpfile.fd();
+  ssize_t bytes_written;
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+
+  zim::size_type bigger_than_4g = 1024LL*1024LL*1024LL*4LL+1024LL;
+
+  // Cluster info byte.
+  char a = 0x11;
+  bytes_written = write(fd, &a, 1);
+  ASSERT_EQ(bytes_written, (ssize_t)1);
+
+  // Offset table: the first offset is the size of the table itself (five
+  // 64-bit entries); each following offset adds the size of one blob.
+  const zim::size_type blobSizes[] = {
+    blob0.size(), blob1.size(), blob2.size(), bigger_than_4g
+  };
+  char out_buf[sizeof(uint64_t)];
+  zim::offset_type offset = 5*sizeof(uint64_t);
+
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+  ASSERT_EQ(bytes_written, (ssize_t)sizeof(uint64_t));
+
+  for (const auto blobSize : blobSizes) {
+    offset += blobSize;
+    zim::toLittleEndian(offset, out_buf);
+    bytes_written = write(fd, out_buf, sizeof(uint64_t));
+    ASSERT_EQ(bytes_written, (ssize_t)sizeof(uint64_t));
+  }
+
+  // Content of the three small blobs.
+  bytes_written = write(fd, blob0.c_str(), blob0.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob0.size());
+
+  bytes_written = write(fd, blob1.c_str(), blob1.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob1.size());
+
+  bytes_written = write(fd, blob2.c_str(), blob2.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob2.size());
+
+  // Fourth blob: skip all of its bytes but the last one, then write that
+  // last byte so the file reaches its full length.
+#ifdef _WIN32
+# define LSEEK _lseeki64
+#else
+# define LSEEK lseek
+#endif
+  ASSERT_NE(LSEEK(fd , bigger_than_4g-1, SEEK_CUR), -1);
+#undef LSEEK
+  a = '\0';
+  bytes_written = write(fd, &a, 1);
+  ASSERT_EQ(bytes_written, (ssize_t)1);
+  tmpfile.close();
+
+  auto fileCompound = std::make_shared<zim::FileCompound>(tmpfile.path());
+  const auto cluster2shptr = zim::Cluster::read(zim::MultiPartFileReader(fileCompound), zim::offset_t(0));
+  zim::Cluster& cluster2 = *cluster2shptr;
+  ASSERT_EQ(cluster2.isExtended, true);
+  ASSERT_EQ(cluster2.count().v, 4U);
+  ASSERT_EQ(cluster2.getCompression(), zim::Compression::None);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(3)).v, bigger_than_4g);
+
+  ASSERT_EQ(blob0, std::string(cluster2.getBlob(zim::blob_index_t(0))));
+  ASSERT_EQ(blob1, std::string(cluster2.getBlob(zim::blob_index_t(1))));
+  ASSERT_EQ(blob2, std::string(cluster2.getBlob(zim::blob_index_t(2))));
+
+  // On 32-bit platforms the big blob is expected to come back empty; on the
+  // others it must have its full size.
+  const zim::Blob b = cluster2.getBlob(zim::blob_index_t(3));
+  if (SIZE_MAX == UINT32_MAX) {
+    ASSERT_EQ(b.data(), nullptr);
+    ASSERT_EQ(b.size(), 0U);
+  } else {
+    ASSERT_EQ(b.size(), bigger_than_4g);
+  }
+}
+
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <algorithm>
+#include <memory>
+#include "gtest/gtest.h"
+
+#include <zim/zim.h>
+
+#include "../src/compression.h"
+
+namespace
+{
+
+// Typed-test fixture: T is the compression description type, from which the
+// matching compressor and decompressor types are derived.
+template<typename T>
+class CompressionTest : public testing::Test {
+  protected:
+    using CompressorT = zim::Compressor<T>;
+    using DecompressorT = zim::Uncompressor<T>;
+};
+
+// List of the types the CompressionTest fixture is instantiated with.
+using CompressionAlgo = ::testing::Types<
+ LZMA_INFO,
+ ZSTD_INFO
+>;
+
+// NOTE(review): newer GoogleTest releases name this macro TYPED_TEST_SUITE;
+// confirm which GoogleTest versions must be supported before renaming.
+TYPED_TEST_CASE(CompressionTest, CompressionAlgo);
+
+// Round-trips a 100000-byte buffer through the compressor and the
+// decompressor of the backend under test, for every combination of initial
+// buffer size and feed chunk size, and checks that the decompressed bytes
+// equal the input.
+TYPED_TEST(CompressionTest, compress) {
+  std::string data;
+  data.reserve(100000);
+  for (int i = 0; i < 100000; ++i) {
+    data.push_back((char)(i%256));
+  }
+  data[99999] = 0;
+
+  const std::vector<unsigned int> initialSizes{32, 1024, 1024*1024};
+  const std::vector<unsigned long> chunkSizes{32, 512, 1024*1024};
+  for (const auto initialSize : initialSizes) {
+    for (const auto chunkSize : chunkSizes) {
+      // Compression: the stream is initialised right before the first chunk
+      // is fed, then the input is pushed chunk by chunk.
+      typename TestFixture::CompressorT compressor(initialSize);
+      {
+        bool initialised = false;
+        size_t offset = 0;
+        for (unsigned long remaining = data.size(); remaining != 0; ) {
+          if (!initialised) {
+            compressor.init(const_cast<char*>(data.c_str()));
+            initialised = true;
+          }
+          const auto step = std::min(remaining, chunkSize);
+          compressor.feed(data.c_str()+offset, step);
+          offset += step;
+          remaining -= step;
+        }
+      }
+
+      zim::zsize_t compressedSize;
+      auto compressed = compressor.get_data(&compressedSize);
+
+      // Decompression: same chunked feeding, this time over the compressed
+      // bytes.
+      typename TestFixture::DecompressorT decompressor(initialSize);
+      {
+        bool initialised = false;
+        size_t offset = 0;
+        for (unsigned long remaining = compressedSize.v; remaining != 0; ) {
+          if (!initialised) {
+            decompressor.init(compressed.get());
+            initialised = true;
+          }
+          const auto step = std::min(remaining, chunkSize);
+          decompressor.feed(compressed.get()+offset, step);
+          offset += step;
+          remaining -= step;
+        }
+      }
+
+      zim::zsize_t restoredSize;
+      auto restored = decompressor.get_data(&restoredSize);
+
+      ASSERT_EQ(restoredSize.v, data.size());
+      ASSERT_EQ(data, std::string(restored.get(), restoredSize.v));
+    }
+  }
+}
+
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/writer/creator.h>
+#include <zim/writer/item.h>
+#include <zim/writer/contentProvider.h>
+
+#include "tools.h"
+#include "../src/file_compound.h"
+#include "../src/file_reader.h"
+#include "../src/direntreader.h"
+#include "../src/dirent_accessor.h"
+#include "../src/_dirent.h"
+#include "../src/fileheader.h"
+#include "../src/cluster.h"
+#include "../src/rawstreamreader.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using namespace zim;
+
+// Tag type, and its single instance, meaning "no expectation" for Optional.
+struct NoneType {};
+const NoneType None = NoneType();
+
+// Expected-value holder for the dirent checks below: either "don't care"
+// (built from None) or a concrete value that check() compares with ASSERT_EQ.
+template<typename T>
+struct Optional{
+  // Inactive expectation: check() does nothing. `value` is value-initialised
+  // so that copying an inactive Optional (they are passed by value) never
+  // reads an indeterminate member.
+  Optional(NoneType) : active(false), value() {}
+  // Active expectation holding the value to compare against.
+  Optional(T value) : active(true), value(value) {}
+  // Asserts that `value` equals the stored expectation, if there is one.
+  void check(const T& value) { if (active) { ASSERT_EQ(this->value, value); } }
+  bool active;
+  T value;
+};
+
+// Specialisation for string expectations: stores a plain std::string and can
+// also be built directly from a string literal.
+template<>
+struct Optional<const std::string> {
+  Optional(NoneType) : active(false) {}
+  Optional(std::string value) : active(true), value(value) {}
+  Optional(const char* value) : active(true), value(value) {}
+  // Asserts that `value` equals the stored expectation, if there is one.
+  void check(const std::string& value) {
+    if (!active) {
+      return;
+    }
+    ASSERT_EQ(this->value, value);
+  }
+  bool active;
+  std::string value;
+};
+
+// Asserts that `dirent` is an article entry, then compares each field for
+// which an expectation other than None was supplied.
+void test_article_dirent(
+  std::shared_ptr<const Dirent> dirent,
+  Optional<char> ns,
+  Optional<const std::string> url,
+  Optional<const std::string> title,
+  Optional<uint16_t> mimetype,
+  Optional<cluster_index_t> clusterNumber,
+  Optional<blob_index_t> blobNumber)
+{
+  ASSERT_TRUE(dirent->isArticle());
+
+  const Dirent& entry = *dirent;
+  ns.check(entry.getNamespace());
+  url.check(entry.getUrl());
+  title.check(entry.getTitle());
+  mimetype.check(entry.getMimeType());
+  clusterNumber.check(entry.getClusterNumber());
+  blobNumber.check(entry.getBlobNumber());
+}
+
+// Asserts that `dirent` is a redirect entry, then compares each field for
+// which an expectation other than None was supplied.
+void test_redirect_dirent(
+  std::shared_ptr<const Dirent> dirent,
+  Optional<char> ns,
+  Optional<const std::string> url,
+  Optional<const std::string> title,
+  Optional<entry_index_t> target)
+{
+  ASSERT_TRUE(dirent->isRedirect());
+
+  const Dirent& entry = *dirent;
+  ns.check(entry.getNamespace());
+  url.check(entry.getUrl());
+  title.check(entry.getTitle());
+  target.check(entry.getRedirectIndex());
+}
+
+TEST(ZimCreator, DoNothing)
+{
+ // Constructing a Creator and letting it go out of scope without ever starting a creation must not crash.
+ writer::Creator creator;
+}
+
+// Creates a ZIM archive without adding any item and inspects the resulting
+// file with the low-level readers: only the two creator-generated entries
+// (M/Counter and X/listing/titleOrdered/v0) are expected.
+TEST(ZimCreator, createEmptyZim)
+{
+  unittests::TempFile tmpFile("emptyzimfile");
+  const auto zimPath = tmpFile.path();
+  zim::Uuid uuid;
+  // Special bytes in the uuid make sure it gets no particular treatment.
+  uuid.data[5] = '\n';
+  uuid.data[10] = '\0';
+
+  writer::Creator creator;
+  creator.setUuid(uuid);
+  creator.startZimCreation(zimPath);
+  creator.finishZimCreation();
+
+  // Check the file through the lower-level structures instead of the
+  // high-level Archive.
+  auto compound = std::make_shared<FileCompound>(zimPath);
+  auto reader = std::make_shared<MultiPartFileReader>(compound);
+  Fileheader header;
+  header.read(*reader);
+  ASSERT_FALSE(header.hasMainPage());
+  ASSERT_EQ(header.getArticleCount(), 2); // counter + titleListIndexesv0
+
+  // Dirent accessor built on top of the url pointer table.
+  const auto entryCount = header.getArticleCount();
+  auto urlPtrReader = reader->sub_reader(offset_t(header.getUrlPtrPos()), zsize_t(sizeof(offset_t)*entryCount));
+  DirectDirentAccessor direntAccessor(std::make_shared<DirentReader>(reader), std::move(urlPtrReader), entry_index_t(entryCount));
+
+  const std::shared_ptr<const Dirent> counterDirent = direntAccessor.getDirent(entry_index_t(0));
+  test_article_dirent(counterDirent, 'M', "Counter", None, 1, cluster_index_t(0), None);
+
+  const std::shared_ptr<const Dirent> listingDirent = direntAccessor.getDirent(entry_index_t(1));
+  test_article_dirent(listingDirent, 'X', "listing/titleOrdered/v0", None, 0, cluster_index_t(1), None);
+  const auto listingBlobIndex = listingDirent->getBlobNumber();
+
+  // The cluster offset is read 8 bytes into the cluster pointer table; the
+  // listing dirent above is expected to live in cluster 1.
+  const auto clusterPtrPos = header.getClusterPtrPos();
+  const auto clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos+8)));
+  const auto cluster = Cluster::read(*reader, clusterOffset);
+  ASSERT_EQ(cluster->getCompression(), Compression::None);
+  ASSERT_EQ(cluster->count(), blob_index_t(1)); // Only titleListIndexesv0
+  const auto blob = cluster->getBlob(listingBlobIndex);
+  ASSERT_EQ(blob.size(), 2*sizeof(title_index_t));
+}
+
+
+// Minimal writer::Item for the tests: a "text/html" item carrying the
+// FRONT_ARTICLE hint whose content is served from an in-memory string.
+class TestItem : public writer::Item
+{
+  public:
+    TestItem(const std::string& path, const std::string& title, const std::string& content)
+      : path(path),
+        title(title),
+        content(content)
+    {}
+    virtual ~TestItem() = default;
+
+    virtual std::string getPath() const { return path; }
+    virtual std::string getTitle() const { return title; }
+    virtual std::string getMimeType() const { return "text/html"; }
+    virtual writer::Hints getHints() const {
+      return { { writer::FRONT_ARTICLE, 1 } };
+    }
+
+    // Hands out a new StringProvider over `content` on every call.
+    virtual std::unique_ptr<writer::ContentProvider> getContentProvider() const {
+      std::unique_ptr<writer::ContentProvider> provider(new writer::StringProvider(content));
+      return provider;
+    }
+
+    std::string path;
+    std::string title;
+    std::string content;
+};
+
+TEST(ZimCreator, createZim)  // Builds a small archive (2 items, metadata, illustrations, redirects) and checks its on-disk layout.
+{
+ unittests::TempFile temp("zimfile");
+ auto tempPath = temp.path();
+ zim::Uuid uuid;
+ // Put special bytes ('\n' and NUL) in the uuid to be sure they get no particular treatment.
+ uuid.data[5] = '\n';
+ uuid.data[10] = '\0';
+
+ writer::Creator creator;
+ creator.setUuid(uuid);
+ creator.configIndexing(true, "eng");
+ creator.startZimCreation(tempPath);
+ auto item = std::make_shared<TestItem>("foo", "Foo", "FooContent");
+ creator.addItem(item);
+ // Make sure that title order is not the same as url order: "AFoo" sorts before "Foo" while "foo" sorts before "foo2".
+ item = std::make_shared<TestItem>("foo2", "AFoo", "Foo2Content");
+ creator.addItem(item);
+ creator.addMetadata("Title", "This is a title");
+ creator.addIllustration(48, "PNGBinaryContent48");
+ creator.addIllustration(96, "PNGBinaryContent96");
+ creator.setMainPath("foo");
+ creator.addRedirection("foo3", "FooRedirection", "foo"); // Not a front article.
+ creator.addRedirection("foo4", "FooRedirection", "NoExistant"); // Invalid redirection (target does not exist), must be removed by the creator
+ creator.finishZimCreation();
+
+ // Do not use the high-level Archive: check that the zim file is correctly created by reading the lower-level structures.
+ auto fileCompound = std::make_shared<FileCompound>(tempPath);
+ auto reader = std::make_shared<MultiPartFileReader>(fileCompound);
+ Fileheader header;
+ header.read(*reader);
+ ASSERT_TRUE(header.hasMainPage());
+#if defined(ENABLE_XAPIAN)
+ entry_index_type nb_entry = 12; // counter + 2*illustration + xapiantitleIndex + xapianfulltextIndex + foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
+ int xapian_mimetype = 0;
+ int listing_mimetype = 1;
+ int png_mimetype = 2;
+ int html_mimetype = 3;
+ int plain_mimetype = 4;
+ int plainutf8_mimetype = 5;
+#else
+ entry_index_type nb_entry = 10; // counter + 2*illustration + foo + foo2 + foo3 + Title + mainPage + titleListIndexes*2
+ int listing_mimetype = 0;
+ int png_mimetype = 1;
+ int html_mimetype = 2;
+ int plain_mimetype = 3;
+ int plainutf8_mimetype = 4;
+#endif
+
+ ASSERT_EQ(header.getArticleCount(), nb_entry);
+
+ // Read the dirents one by one, in the order of the url pointer list.
+ auto urlPtrReader = reader->sub_reader(offset_t(header.getUrlPtrPos()), zsize_t(sizeof(offset_t)*header.getArticleCount()));
+ DirectDirentAccessor direntAccessor(std::make_shared<DirentReader>(reader), std::move(urlPtrReader), entry_index_t(header.getArticleCount()));
+ std::shared_ptr<const Dirent> dirent;
+
+ entry_index_type direntIdx = 0;
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'C', "foo", "Foo", html_mimetype, cluster_index_t(0), None);
+ auto fooBlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'C', "foo2", "AFoo", html_mimetype, cluster_index_t(0), None);
+ auto foo2BlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_redirect_dirent(dirent, 'C', "foo3", "FooRedirection", entry_index_t(0));
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'M', "Counter", None, plain_mimetype, cluster_index_t(0), None);
+ auto counterBlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'M', "Illustration_48x48@1", None, png_mimetype, cluster_index_t(1), None);
+ auto illustration48BlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'M', "Illustration_96x96@1", None, png_mimetype, cluster_index_t(1), None);
+ auto illustration96BlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'M', "Title", "Title", plainutf8_mimetype, cluster_index_t(0), None);
+ auto titleBlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_redirect_dirent(dirent, 'W', "mainPage", "mainPage", entry_index_t(0));
+
+#if defined(ENABLE_XAPIAN)
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'X', "fulltext/xapian", "fulltext/xapian", xapian_mimetype, cluster_index_t(1), None);
+#endif
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'X', "listing/titleOrdered/v0", None, listing_mimetype, cluster_index_t(1), None);
+ auto v0BlobIndex = dirent->getBlobNumber();
+
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'X', "listing/titleOrdered/v1", None, listing_mimetype, cluster_index_t(1), None);
+ auto v1BlobIndex = dirent->getBlobNumber();
+
+#if defined(ENABLE_XAPIAN)
+ dirent = direntAccessor.getDirent(entry_index_t(direntIdx++));
+ test_article_dirent(dirent, 'X', "title/xapian", "title/xapian", xapian_mimetype, cluster_index_t(1), None);
+#endif
+
+ auto clusterPtrPos = header.getClusterPtrPos();
+
+ // Test main content: the cluster whose offset is stored first in the cluster pointer table (cluster 0 in the dirent checks above).
+ auto clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos)));
+ auto cluster = Cluster::read(*reader, clusterOffset);
+ ASSERT_EQ(cluster->getCompression(), Compression::Zstd);
+ ASSERT_EQ(cluster->count(), blob_index_t(4)); // 4 entries are compressed content: foo, foo2, Title and Counter
+
+ auto blob = cluster->getBlob(fooBlobIndex);
+ ASSERT_EQ(std::string(blob), "FooContent");
+
+ blob = cluster->getBlob(foo2BlobIndex);
+ ASSERT_EQ(std::string(blob), "Foo2Content");
+
+ blob = cluster->getBlob(titleBlobIndex);
+ ASSERT_EQ(std::string(blob), "This is a title");
+
+ blob = cluster->getBlob(counterBlobIndex);
+ ASSERT_EQ(std::string(blob), "text/html=2");
+
+
+ // Test listing content: the cluster whose offset is stored 8 bytes further in the cluster pointer table (cluster 1 in the dirent checks above).
+ clusterOffset = offset_t(reader->read_uint<offset_type>(offset_t(clusterPtrPos + 8)));
+ cluster = Cluster::read(*reader, clusterOffset);
+ ASSERT_EQ(cluster->getCompression(), Compression::None);
+ ASSERT_EQ(cluster->count(), blob_index_t(nb_entry-6)); // 6 entries are either compressed (4) or redirections (foo3, mainPage)
+
+ ASSERT_EQ(header.getTitleIdxPos(), (clusterOffset+cluster->getBlobOffset(v0BlobIndex)).v);  // the header's title index position must point at the v0 listing blob
+
+ blob = cluster->getBlob(v0BlobIndex);
+ ASSERT_EQ(blob.size(), nb_entry*sizeof(title_index_t));
+ std::vector<char> blob0Data(blob.data(), blob.end());
+ std::vector<char> expectedBlob0Data = {
+ 1, 0, 0, 0,  // entry 1 (foo2, titled "AFoo") is listed before entry 0 (foo, titled "Foo")
+ 0, 0, 0, 0,
+ 2, 0, 0, 0,
+ 3, 0, 0, 0,
+ 4, 0, 0, 0,
+ 5, 0, 0, 0,
+ 6, 0, 0, 0,
+ 7, 0, 0, 0,
+ 8, 0, 0, 0,
+ 9, 0, 0, 0
+#if defined(ENABLE_XAPIAN)
+ ,10, 0, 0, 0
+ ,11, 0, 0, 0
+#endif
+ };
+ ASSERT_EQ(blob0Data, expectedBlob0Data);
+
+ blob = cluster->getBlob(v1BlobIndex);
+ ASSERT_EQ(blob.size(), 2*sizeof(title_index_t));  // only 2 entries — presumably just the front articles foo2 and foo; TODO confirm
+ std::vector<char> blob1Data(blob.data(), blob.end());
+ std::vector<char> expectedBlob1Data = {
+ 1, 0, 0, 0,
+ 0, 0, 0, 0
+ };
+ ASSERT_EQ(blob1Data, expectedBlob1Data);
+
+ blob = cluster->getBlob(illustration48BlobIndex);
+ ASSERT_EQ(std::string(blob), "PNGBinaryContent48");
+
+ blob = cluster->getBlob(illustration96BlobIndex);
+ ASSERT_EQ(std::string(blob), "PNGBinaryContent96");
+}
+
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "decoderstreamreader.h"
+#include "buffer_reader.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+// Compresses `data` with the backend described by CompressionInfo, feeding
+// it in one go, and returns the compressed bytes as a string.
+template<class CompressionInfo>
+std::string
+compress(const std::string& data)
+{
+  zim::Compressor<CompressionInfo> compressor(data.size());
+  compressor.init(const_cast<char*>(data.c_str()));
+  compressor.feed(data.c_str(), data.size());
+
+  zim::zsize_t compressedSize;
+  const auto compressed = compressor.get_data(&compressedSize);
+  std::string result(compressed.get(), compressedSize.v);
+  return result;
+}
+
+// Returns `s` repeated N times (an empty string when N is 0).
+std::string operator*(const std::string& s, unsigned N)
+{
+  std::string result;
+  // The final length is known up front: allocate once instead of growing the
+  // buffer on every append.
+  result.reserve(s.size() * N);
+  for (unsigned i=0; i<N; i++)
+    result += s;
+  return result;
+}
+
+// Copies the whole content of `buffer` into a std::string.
+std::string toString(const zim::Buffer& buffer)
+{
+  const auto length = buffer.size().v;
+  return std::string(buffer.data(), length);
+}
+
+// Typed fixture: makes the compression backend under test available to the
+// test bodies as TestFixture::CompressionInfo.
+template<typename T>
+class DecoderStreamReaderTest : public testing::Test {
+  protected:
+    using CompressionInfo = T;
+};
+
+// Compression backends every DecoderStreamReaderTest case is instantiated for.
+using CompressionTypes = ::testing::Types<LZMA_INFO, ZSTD_INFO>;
+
+TYPED_TEST_CASE(DecoderStreamReaderTest, CompressionTypes);
+
+// Compresses ten back-to-back copies of a string and reads them back from a
+// DecoderStreamReader, one sub_reader() call per copy.
+TYPED_TEST(DecoderStreamReaderTest, justCompressedData) {
+  using CompressionInfo = typename TestFixture::CompressionInfo;
+
+  const int repeatCount = 10;
+  const std::string chunk("DecoderStreamReader should work correctly");
+  const std::string compressed = compress<CompressionInfo>(chunk*repeatCount);
+  auto compData = zim::Buffer::makeBuffer(compressed.data(), zim::zsize_t(compressed.size()));
+
+  auto compReader = std::make_shared<zim::BufferReader>(compData);
+  zim::DecoderStreamReader<CompressionInfo> dds(compReader);
+  const auto chunkSize = zim::zsize_t(chunk.size());
+  for (int i = 0; i < repeatCount; ++i)
+  {
+    auto piece = dds.sub_reader(chunkSize);
+    ASSERT_EQ(chunk, toString(piece->get_buffer(zim::offset_t(0), chunkSize))) << "i: " << i;
+  }
+}
+
+// Same as justCompressedData, but ten NUL bytes are appended after the
+// compressed stream; the decoded copies must still come out intact.
+TYPED_TEST(DecoderStreamReaderTest, compressedDataFollowedByGarbage) {
+  using CompressionInfo = typename TestFixture::CompressionInfo;
+
+  const int repeatCount = 10;
+  const std::string chunk("DecoderStreamReader should work correctly");
+  std::string compressed = compress<CompressionInfo>(chunk*repeatCount);
+  compressed += std::string(10, '\0');
+
+  auto compData = zim::Buffer::makeBuffer(compressed.data(), zim::zsize_t(compressed.size()));
+  auto compReader = std::make_shared<zim::BufferReader>(compData);
+
+  zim::DecoderStreamReader<CompressionInfo> dds(compReader);
+  const auto chunkSize = zim::zsize_t(chunk.size());
+  for (int i = 0; i < repeatCount; ++i)
+  {
+    auto piece = dds.sub_reader(chunkSize);
+    ASSERT_EQ(chunk, toString(piece->get_buffer(zim::offset_t(0), chunkSize))) << "i: " << i;
+  }
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+
+#include <zim/writer/contentProvider.h>
+
+#include "../src/writer/defaultIndexData.h"
+#include "gtest/gtest.h"
+
+namespace {
+
+ // Wraps `content` in a StringProvider and builds a DefaultIndexData from it
+ // and `title`.
+ std::unique_ptr<zim::writer::IndexData> index_data(const std::string& content, const std::string& title)
+ {
+   std::unique_ptr<zim::writer::ContentProvider> provider(new zim::writer::StringProvider(content));
+   std::unique_ptr<zim::writer::IndexData> result(new zim::writer::DefaultIndexData(std::move(provider), title));
+   return result;
+ }
+
+ // Empty content: no index data, only the (lower-cased) title is expected.
+ TEST(DefaultIndexdata, empty) {
+   const auto data = index_data("", "A Title");
+
+   ASSERT_EQ(data->hasIndexData(), false);
+   ASSERT_EQ(data->getTitle(), "a title");
+   ASSERT_EQ(data->getContent(), "");
+   ASSERT_EQ(data->getKeywords(), "");
+   ASSERT_EQ(data->getWordCount(), 0);
+   ASSERT_EQ(data->getGeoPosition(), std::make_tuple(false, 0, 0));
+ }
+
+ // Plain HTML body: the markup is expected to be stripped and the text
+ // lower-cased, giving three indexed words.
+ TEST(DefaultIndexdata, simple) {
+   const auto data = index_data("<html><body>Some <b>bold</b> words</body><html>", "A Title");
+
+   ASSERT_EQ(data->hasIndexData(), true);
+   ASSERT_EQ(data->getTitle(), "a title");
+   ASSERT_EQ(data->getContent(), "some bold words");
+   ASSERT_EQ(data->getKeywords(), "");
+   ASSERT_EQ(data->getWordCount(), 3);
+   ASSERT_EQ(data->getGeoPosition(), std::make_tuple(false, 0, 0));
+ }
+
+ // A robots meta tag with content="noindex" in the head: nothing is expected
+ // to be indexed.
+ TEST(DefaultIndexdata, noindexhead) {
+   const auto data = index_data(R"(<html><head><meta name="robots" content="noindex"></head><body>Some <b>bold</b> words</body><html>)", "A Title");
+
+   ASSERT_EQ(data->hasIndexData(), false);
+   ASSERT_EQ(data->getTitle(), "a title");
+   ASSERT_EQ(data->getContent(), "");
+   ASSERT_EQ(data->getKeywords(), "");
+   ASSERT_EQ(data->getWordCount(), 0);
+   ASSERT_EQ(data->getGeoPosition(), std::make_tuple(false, 0, 0));
+ }
+
+ // A robots meta tag with content="none" in the head: nothing is expected to
+ // be indexed either.
+ TEST(DefaultIndexdata, noindexnone) {
+   const auto data = index_data(R"(<html><head><meta name="robots" content="none"></head><body>Some <b>bold</b> words</body><html>)", "A Title");
+
+   ASSERT_EQ(data->hasIndexData(), false);
+   ASSERT_EQ(data->getTitle(), "a title");
+   ASSERT_EQ(data->getContent(), "");
+   ASSERT_EQ(data->getKeywords(), "");
+   ASSERT_EQ(data->getWordCount(), 0);
+   ASSERT_EQ(data->getGeoPosition(), std::make_tuple(false, 0, 0));
+ }
+
+ // A NOINDEX marker inside the body text: hasIndexData() is expected to be
+ // false while the content and the word count are still reported.
+ TEST(DefaultIndexdata, noindexbody) {
+   const auto data = index_data("<html><body>NOINDEXSome <b>bold</b> words</body><html>", "A Title");
+
+   ASSERT_EQ(data->hasIndexData(), false);
+   ASSERT_EQ(data->getTitle(), "a title");
+   ASSERT_EQ(data->getContent(), "noindexsome bold words");
+   ASSERT_EQ(data->getKeywords(), "");
+   ASSERT_EQ(data->getWordCount(), 3);
+   ASSERT_EQ(data->getGeoPosition(), std::make_tuple(false, 0, 0));
+ }
+
+ // Document with keywords and geo.position meta tags: content, keywords,
+ // word count and the parsed geo position are all checked.
+ TEST(DefaultIndexdata, full) {
+   auto indexData = index_data(R"(<html><head><meta name="keywords" content="some keyword important"><meta name="geo.position" content="45.005;10.100"></head><body>Some <b>bold</b> words</body><html>)", "A Title");
+
+   ASSERT_EQ(indexData->hasIndexData(), true);
+   ASSERT_EQ(indexData->getTitle(), "a title");
+   ASSERT_EQ(indexData->getContent(), "some bold words");
+   ASSERT_EQ(indexData->getKeywords(), "some keyword important");
+   ASSERT_EQ(indexData->getWordCount(), 3);
+   auto geoPos = indexData->getGeoPosition();
+   ASSERT_TRUE(std::get<0>(geoPos));
+   // ASSERT_NEAR does the floating-point comparison itself: no reliance on
+   // which std::abs overload happens to be visible, and a failure reports
+   // the actual values.
+   ASSERT_NEAR(std::get<1>(geoPos), 45.005, 0.00001);
+   ASSERT_NEAR(std::get<2>(geoPos), 10.1, 0.00001);
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <memory>
+#include <stdexcept>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#endif
+
+#include "gtest/gtest.h"
+
+#include "../src/buffer.h"
+#include "../src/_dirent.h"
+#include "../src/direntreader.h"
+#include "../src/buffer_reader.h"
+#include "../src/writer/_dirent.h"
+
+#include "tools.h"
+
+namespace
+{
+
+using zim::unittests::TempFile;
+using zim::unittests::write_to_buffer;
+using zim::writer::NS;
+
+// Parses the dirent stored at offset 0 of `buf` and returns a copy of it.
+zim::Dirent read_from_buffer(const zim::Buffer& buf)
+{
+  const auto bufferReader = std::make_shared<zim::BufferReader>(buf);
+  zim::DirentReader direntReader(bufferReader);
+  const auto parsed = direntReader.readDirent(zim::offset_t(0));
+  return *parsed;
+}
+
+// Writes `dirent` to a fresh temporary file and returns the resulting file
+// size in bytes.
+// Throws std::runtime_error if the size cannot be determined.
+size_t writenDirentSize(const zim::writer::Dirent& dirent)
+{
+  TempFile tmpFile("test_dirent");
+  const auto tmp_fd = tmpFile.fd();
+  dirent.write(tmp_fd);
+  // Seeking to the end yields the file size. lseek() signals failure with -1,
+  // which must not be silently converted to a huge size_t.
+  const auto size = lseek(tmp_fd, 0, SEEK_END);
+  if (size < 0) {
+    throw std::runtime_error("lseek failed on the temporary dirent file");
+  }
+  return static_cast<size_t>(size);
+}
+
+// Pins sizeof(zim::writer::Dirent) for each platform so that any change
+// growing the structure is noticed.
+TEST(DirentTest, size)
+{
+  // Dirent's size is important for us as we are creating huge zim files on
+  // linux and we need to store a lot of dirents. The expected value differs
+  // on Windows and on 32 bits builds (where Dirent is smaller).
+#if defined(_WIN32)
+  ASSERT_EQ(sizeof(zim::writer::Dirent), 72);
+#elif ENV32BIT
+  ASSERT_EQ(sizeof(zim::writer::Dirent), 30);
+#else
+  ASSERT_EQ(sizeof(zim::writer::Dirent), 38);
+#endif
+}
+
+// Exercises the setters and getters of the reader-side zim::Dirent for an
+// item entry; the title is expected to equal the url until setTitle() is
+// called.
+TEST(DirentTest, set_get_data_dirent)
+{
+  zim::Dirent entry;
+  entry.setUrl('C', "Bar");
+  entry.setItem(17, zim::cluster_index_t(45), zim::blob_index_t(1234));
+  entry.setVersion(54346);
+
+  ASSERT_TRUE(!entry.isRedirect());
+  ASSERT_EQ(entry.getNamespace(), 'C');
+  ASSERT_EQ(entry.getUrl(), "Bar");
+  ASSERT_EQ(entry.getTitle(), "Bar");
+  ASSERT_EQ(entry.getParameter(), "");
+  ASSERT_EQ(entry.getBlobNumber().v, 1234U);
+  ASSERT_EQ(entry.getVersion(), 54346U);
+
+  // An explicit title must not affect the namespace, url or parameter.
+  entry.setTitle("Foo");
+  ASSERT_EQ(entry.getNamespace(), 'C');
+  ASSERT_EQ(entry.getUrl(), "Bar");
+  ASSERT_EQ(entry.getTitle(), "Foo");
+  ASSERT_EQ(entry.getParameter(), "");
+}
+
+// Serialises a writer-side item dirent and checks that the reader-side
+// dirent parsed from those bytes carries the same fields.
+TEST(DirentTest, read_write_article_dirent)
+{
+  zim::writer::Dirent written(NS::C, "Bar", "Foo", 17);
+  zim::writer::Cluster cluster(zim::Compression::None);
+  cluster.addContent(""); // Add a dummy content
+  cluster.setClusterIndex(zim::cluster_index_t(45));
+  written.setCluster(&cluster);
+
+  ASSERT_TRUE(written.isItem());
+  ASSERT_EQ(written.getNamespace(), NS::C);
+  ASSERT_EQ(written.getPath(), "Bar");
+  ASSERT_EQ(written.getTitle(), "Foo");
+  ASSERT_EQ(written.getClusterNumber().v, 45U);
+  ASSERT_EQ(written.getBlobNumber().v, 1U);
+  ASSERT_EQ(written.getVersion(), 0U);
+
+  auto serialized = write_to_buffer(written);
+  zim::Dirent readBack = read_from_buffer(serialized);
+
+  ASSERT_TRUE(!readBack.isRedirect());
+  ASSERT_EQ(readBack.getNamespace(), 'C');
+  ASSERT_EQ(readBack.getTitle(), "Foo");
+  ASSERT_EQ(readBack.getParameter(), "");
+  ASSERT_EQ(readBack.getClusterNumber().v, 45U);
+  ASSERT_EQ(readBack.getBlobNumber().v, 1U);
+  ASSERT_EQ(readBack.getVersion(), 0U);
+}
+
+// Same round trip as read_write_article_dirent, with a path containing a
+// multi-byte UTF-8 sequence and an empty title (the title is then expected
+// to equal the path).
+TEST(DirentTest, read_write_article_dirent_unicode)
+{
+  zim::writer::Dirent written(NS::C, "L\xc3\xbcliang", "", 17);
+  zim::writer::Cluster cluster(zim::Compression::None);
+  cluster.addContent(""); // Add a dummy content
+  cluster.setClusterIndex(zim::cluster_index_t(45));
+  written.setCluster(&cluster);
+
+  ASSERT_TRUE(written.isItem());
+  ASSERT_EQ(written.getNamespace(), NS::C);
+  ASSERT_EQ(written.getPath(), "L\xc3\xbcliang");
+  ASSERT_EQ(written.getTitle(), "L\xc3\xbcliang");
+  ASSERT_EQ(written.getClusterNumber().v, 45U);
+  ASSERT_EQ(written.getBlobNumber().v, 1U);
+
+  auto serialized = write_to_buffer(written);
+  zim::Dirent readBack = read_from_buffer(serialized);
+
+  ASSERT_TRUE(!readBack.isRedirect());
+  ASSERT_EQ(readBack.getNamespace(), 'C');
+  ASSERT_EQ(readBack.getUrl(), "L\xc3\xbcliang");
+  ASSERT_EQ(readBack.getTitle(), "L\xc3\xbcliang");
+  ASSERT_EQ(readBack.getParameter(), "");
+  ASSERT_EQ(readBack.getClusterNumber().v, 45U);
+  ASSERT_EQ(readBack.getBlobNumber().v, 1U);
+}
+
+// Serialises a writer-side redirect dirent, once resolved to its target,
+// and checks the reader-side dirent parsed from those bytes.
+TEST(DirentTest, read_write_redirect_dirent)
+{
+  zim::writer::Dirent target(NS::C, "Foo", "", 17);
+  target.setIdx(zim::entry_index_t(321));
+
+  zim::writer::Dirent redirect(NS::C, "Bar", "", NS::C, "Foo");
+  ASSERT_EQ(redirect.getRedirectNs(), NS::C);
+  ASSERT_EQ(redirect.getRedirectPath(), "Foo");
+  redirect.setRedirect(&target);
+
+  ASSERT_TRUE(redirect.isRedirect());
+  ASSERT_EQ(redirect.getNamespace(), NS::C);
+  ASSERT_EQ(redirect.getPath(), "Bar");
+  ASSERT_EQ(redirect.getRedirectIndex().v, 321U);
+
+  auto serialized = write_to_buffer(redirect);
+  zim::Dirent readBack = read_from_buffer(serialized);
+
+  ASSERT_TRUE(readBack.isRedirect());
+  ASSERT_EQ(readBack.getNamespace(), 'C');
+  ASSERT_EQ(readBack.getUrl(), "Bar");
+  ASSERT_EQ(readBack.getTitle(), "Bar");
+  ASSERT_EQ(readBack.getRedirectIndex().v, 321U);
+}
+
+// getDirentSize() must match the number of bytes actually written, with and
+// without an explicit title.
+TEST(DirentTest, dirent_size)
+{
+  // case url set, title empty, extralen empty
+  zim::writer::Dirent untitled(NS::C, "Bar", "", 17);
+  ASSERT_EQ(untitled.getDirentSize(), writenDirentSize(untitled));
+
+  // case url set, title set, extralen empty
+  zim::writer::Dirent titled(NS::C, "Bar", "Foo", 17);
+  ASSERT_EQ(titled.getDirentSize(), writenDirentSize(titled));
+}
+
+// getDirentSize() must match the number of bytes actually written for a
+// resolved redirect dirent too.
+TEST(DirentTest, redirect_dirent_size)
+{
+  zim::writer::Dirent target(NS::C, "Foo", "", 17);
+  target.setIdx(zim::entry_index_t(321));
+
+  zim::writer::Dirent redirect(NS::C, "Bar", "", NS::C, "Foo");
+  redirect.setRedirect(&target);
+
+  ASSERT_EQ(redirect.getDirentSize(), writenDirentSize(redirect));
+}
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "../src/dirent_lookup.h"
+#include "../src/_dirent.h"
+#include <zim/zim.h>
+
+#include "gtest/gtest.h"
+
+#include <vector>
+#include <string>
+#include <utility>
+
+namespace
+{
+
+const std::vector<std::pair<char, std::string>> articleurl = {  // (namespace, url) pairs, listed in ascending (namespace, url) order
+ {'A', "aa"}, // index 0
+ {'A', "aaaa"}, // index 1
+ {'A', "aaaaaa"}, // index 2
+ {'A', "aaaabb"}, // index 3
+ {'A', "aaaacc"}, // index 4
+ {'A', "aabbaa"}, // index 5
+ {'A', "aabbbb"}, // index 6
+ {'A', "aabbcc"}, // index 7
+ {'A', "cccccc"}, // index 8
+ {'M', "foo"}, // index 9
+ {'a', "aa"}, // index 10
+ {'a', "bb"}, // index 11
+ {'b', "aa"} // index 12
+};
+
+// Stand-in for a dirent accessor, backed by the articleurl table: dirent
+// number i only has the namespace and url of articleurl[i] set.
+struct GetDirentMock
+{
+  zim::entry_index_t getDirentCount() const {
+    const auto count = articleurl.size();
+    return zim::entry_index_t(count);
+  }
+
+  // Throws std::out_of_range (from vector::at) for an index past the table.
+  std::shared_ptr<const zim::Dirent> getDirent(zim::entry_index_t idx) const {
+    const auto& entry = articleurl.at(idx.v);
+    auto dirent = std::make_shared<zim::Dirent>();
+    dirent->setUrl(entry.first, entry.second);
+    return dirent;
+  }
+};
+
+// Fixture giving the namespace-offset tests a GetDirentMock instance.
+class NamespaceTest : public ::testing::Test
+{
+  protected:
+    GetDirentMock impl;
+};
+
+// getNamespaceBeginOffset() must return the index of the first entry of the
+// namespace or, when the namespace has no entry, the index where it would
+// start.
+TEST_F(NamespaceTest, BeginOffset)
+{
+  // Namespaces at the end of the table, and one past every entry.
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'a').v, 10);
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'b').v, 12);
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'c').v, 13);
+
+  // Namespace sorting before every entry, then the first namespace.
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'A'-1).v, 0);
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'A').v, 0);
+
+  // Single-entry namespace, then a missing namespace between 'M' and 'a'.
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'M').v, 9);
+  ASSERT_EQ(zim::getNamespaceBeginOffset(impl, 'U').v, 10);
+}
+
+// getNamespaceEndOffset() must return the index one past the last entry of
+// the namespace or, when the namespace has no entry, the index where it
+// would be located.
+TEST_F(NamespaceTest, EndOffset)
+{
+  // Namespaces at the end of the table, and one past every entry.
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'a').v, 12);
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'b').v, 13);
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'c').v, 13);
+
+  // Namespace sorting before every entry, then the first namespace.
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'A'-1).v, 0);
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'A').v, 9);
+
+  // Single-entry namespace, then a missing namespace between 'M' and 'a'.
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'M').v, 10);
+  ASSERT_EQ(zim::getNamespaceEndOffset(impl, 'U').v, 10);
+}
+
+// For every namespace character from 32 to 126, the end offset of the
+// namespace must equal the begin offset of the next one.
+TEST_F(NamespaceTest, EndEqualStartPlus1)
+{
+  for (char ns = 32; ns < 127; ++ns) {
+    std::cout << "ns: " << ns << "|" << (int)ns << std::endl;
+    ASSERT_EQ(
+      zim::getNamespaceEndOffset(impl, ns).v,
+      zim::getNamespaceBeginOffset(impl, ns+1).v);
+  }
+}
+
+
+// Fixture giving the DirentLookup tests a GetDirentMock instance.
+class FindxTest : public ::testing::Test
+{
+  protected:
+    GetDirentMock impl;
+};
+
+// find() on an existing (namespace, url) pair must report a match together
+// with the index of that entry.
+TEST_F(FindxTest, ExactMatch)
+{
+  zim::DirentLookup<GetDirentMock> lookup(&impl, 4);
+
+  auto hit = lookup.find('A', "aa");
+  ASSERT_EQ(hit.first, true);
+  ASSERT_EQ(hit.second.v, 0);
+
+  hit = lookup.find('a', "aa");
+  ASSERT_EQ(hit.first, true);
+  ASSERT_EQ(hit.second.v, 10);
+
+  hit = lookup.find('A', "aabbbb");
+  ASSERT_EQ(hit.first, true);
+  ASSERT_EQ(hit.second.v, 6);
+
+  hit = lookup.find('b', "aa");
+  ASSERT_EQ(hit.first, true);
+  ASSERT_EQ(hit.second.v, 12);
+}
+
+
+// Looking up a (namespace, path) pair that does not exist must report a miss
+// (result.first == false) and the index of the first dirent after the
+// position where the key would be.
+TEST_F(FindxTest, NoExactMatch)
+{
+  zim::DirentLookup<GetDirentMock> dl(&impl, 4);
+
+  // No U namespace => return 10 (the index of the first item from the next namespace)
+  auto missed = dl.find('U', "aa");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 10);
+
+  // aabb is between aaaacc (4) and aabbaa (5) => 5
+  missed = dl.find('A', "aabb");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 5);
+
+  // aabbb is between aabbaa (5) and aabbbb (6) => 6
+  missed = dl.find('A', "aabbb");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 6);
+
+  // aabbbc is between aabbbb (6) and aabbcc (7) => 7
+  missed = dl.find('A', "aabbbc");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 7);
+
+  // bb is between aabbcc (7) and cccccc (8) => 8
+  missed = dl.find('A', "bb");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 8);
+
+  // dd is after cccccc (8) => 9
+  missed = dl.find('A', "dd");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 9);
+
+  // f is before foo (9) => 9
+  missed = dl.find('M', "f");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 9);
+
+  // bar is before foo (9) => 9
+  missed = dl.find('M', "bar");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 9);
+
+  // foo1 is after foo (9) => 10
+  missed = dl.find('M', "foo1");
+  ASSERT_FALSE(missed.first);
+  ASSERT_EQ(missed.second.v, 10);
+}
+
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2009 Miguel Rocha
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/archive.h>
+#include <zim/error.h>
+
+#include "tools.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+// Not found cases
+
+
+using zim::unittests::getDataFilePath;
+using zim::unittests::TempZimArchive;
+using zim::unittests::TestItem;
+
+// ByTitle
+#if WITH_TEST_DATA
+// findByTitle() must return an empty range for title prefixes no entry has.
+TEST(FindTests, NotFoundByTitle)
+{
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) {
+    zim::Archive archive (testfile.path);
+
+    auto unknownTitleRange = archive.findByTitle("unkownTitle");
+    auto pathLikeTitleRange = archive.findByTitle("j/body.js");
+
+    ASSERT_EQ(unknownTitleRange.begin(), unknownTitleRange.end());
+    ASSERT_EQ(pathLikeTitleRange.begin(), pathLikeTitleRange.end());
+  }
+}
+
+// By Path
+// findByPath() must return an empty range for path prefixes no entry has.
+TEST(FindTests, NotFoundByPath)
+{
+  const char* const missingPaths[] = {
+    "unkwonUrl",
+    "U/unkwonUrl",
+    "A/unkwonUrl",
+    "X",
+    "X/"
+  };
+
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) {
+    zim::Archive archive (testfile.path);
+
+    for (const char* missingPath : missingPaths) {
+      auto range = archive.findByPath(missingPath);
+      ASSERT_EQ(range.begin(), range.end());
+    }
+  }
+}
+
+// Found cases
+
+// ByTitle
+// Checks findByTitle() on title prefixes that exist in the test archives, and
+// the effect of EntryRange::offset(start, maxResults) on the returned range.
+TEST(FindTests, ByTitle)
+{
+ for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) {
+ zim::Archive archive (testfile.path);
+
+ auto range0 = archive.findByTitle("Першая старонка");
+
+ // Every returned title must start with the searched prefix.
+ auto count = 0;
+ for(auto& entry: range0) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Першая старонка"), 0);
+ }
+ if (testfile.category == "withns") {
+ // On the withns test file, there are two entries with this title:
+ // the entry itself and the index.html (a redirection)
+ ASSERT_EQ(count, 2);
+ } else {
+ // On the new test file, the main page redirection is stored in the `W` namespace,
+ // so findByTitle finds only 1 entry in the `C` namespace.
+ ASSERT_EQ(count, 1);
+ }
+
+ auto range1 = archive.findByTitle("Украінская");
+
+ count = 0;
+ for(auto& entry: range1) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Украінская"), 0);
+ }
+ ASSERT_EQ(count, 5);
+
+ // Start at the first result, limited to at most 2 results
+ auto range2 = archive.findByTitle("Украінская");
+ range2 = range2.offset(0, 2);
+ count = 0;
+ for(auto& entry: range2) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Украінская"), 0);
+ }
+ ASSERT_EQ(count, 2);
+
+ // Skip the first result, then take at most 4 results
+ auto range3 = archive.findByTitle("Украінская");
+ range3 = range3.offset(1, 4);
+ count = 0;
+ for(auto& entry: range3) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Украінская"), 0);
+ }
+ ASSERT_EQ(count, 4);
+
+ // Offset with max results greater than the number of results
+ auto range4 = archive.findByTitle("Украінская");
+ range4 = range4.offset(0, 10);
+ count = 0;
+ for(auto& entry: range4) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Украінская"), 0);
+ }
+ ASSERT_EQ(count, 5);
+
+ // Offset with start greater than the number of results
+ auto range5 = archive.findByTitle("Украінская");
+ range5 = range5.offset(10, 5);
+ count = 0;
+ for(auto& entry: range5) {
+ count++;
+ ASSERT_EQ(entry.getTitle().find("Украінская"), 0);
+ }
+ ASSERT_EQ(count, 0);
+ }
+}
+
+// Iterates archive.findByTitle(prefix), asserting that every returned title
+// starts with `prefix` and that exactly `expected_count` entries are returned.
+// Requires a variable named `archive` in the calling scope. `prefix` is
+// expanded twice. The macro expands to a braced block, so call sites do not
+// need a trailing semicolon.
+#define CHECK_FIND_TITLE_COUNT(prefix, expected_count) \
+{ \
+ auto count = 0; \
+ auto range = archive.findByTitle(prefix); \
+ for(auto& entry: range) { \
+ count++; \
+ ASSERT_EQ(entry.getTitle().find(prefix), 0); \
+ } \
+ ASSERT_EQ(count, expected_count); \
+}
+
+// Builds a small archive in which several items share the same title and checks
+// that findByTitle() returns the whole run of entries matching each prefix.
+TEST(FindTests, ByTitleWithDuplicate)
+{
+ TempZimArchive tza("testZim");
+ zim::writer::Creator creator;
+ creator.startZimCreation(tza.getPath());
+ creator.addItem(std::make_shared<TestItem>("article0", "text/html", "AAA", ""));
+ creator.addItem(std::make_shared<TestItem>("article1", "text/html", "BB", ""));
+ creator.addItem(std::make_shared<TestItem>("article2", "text/html", "BBB", ""));
+ creator.addItem(std::make_shared<TestItem>("article3", "text/html", "BBB", ""));
+ creator.addItem(std::make_shared<TestItem>("article4", "text/html", "BBBB", ""));
+ creator.addItem(std::make_shared<TestItem>("article5", "text/html", "CCC", ""));
+ creator.addItem(std::make_shared<TestItem>("article6", "text/html", "CCC", ""));
+ creator.finishZimCreation();
+
+ zim::Archive archive(tza.getPath());
+ // The first binary search step will look at index 3 (0+6/2), which is a BBB,
+ // but we want to be sure it returns article2, which is the start of the range "BBB*"
+ CHECK_FIND_TITLE_COUNT("BBB", 3)
+ CHECK_FIND_TITLE_COUNT("BB", 4)
+ CHECK_FIND_TITLE_COUNT("BBBB", 1)
+ CHECK_FIND_TITLE_COUNT("CCC", 2)
+ CHECK_FIND_TITLE_COUNT("C", 2)
+}
+
+
+// By Path
+// Checks findByPath() on the "withns" variant of the test archive: for each
+// prefix, the index of the first match, the number of matches, and that every
+// returned path starts with the prefix.
+TEST(FindTests, ByPath)
+{
+ for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "withns")) {
+ zim::Archive archive (testfile.path);
+
+ auto range0 = archive.findByPath("A/Main_Page.html");
+ auto range1 = archive.findByPath("I/s/");
+ auto range2 = archive.findByPath("-/j/head.js");
+ auto range3 = archive.findByPath("I");
+ auto range4 = archive.findByPath("I/");
+ auto range5 = archive.findByPath("");
+ auto range6 = archive.findByPath("/");
+
+ // Full path of a single entry.
+ ASSERT_EQ(range0.begin()->getIndex(), 5);
+ auto count = 0;
+ for(auto& entry: range0) {
+ count++;
+ ASSERT_EQ(entry.getPath().find("A/Main_Page.html"), 0);
+ }
+ ASSERT_EQ(count, 1);
+
+ // Prefix made of a namespace and a sub-directory.
+ ASSERT_EQ(range1.begin()->getIndex(), 78);
+ count = 0;
+ for(auto& entry: range1) {
+ count++;
+ std::cout << entry.getPath() << std::endl;
+ ASSERT_EQ(entry.getPath().find("I/s/"), 0);
+ }
+ ASSERT_EQ(count, 31);
+
+ // Full path of a single entry in the `-` namespace.
+ ASSERT_EQ(range2.begin()->getIndex(), 2);
+ count = 0;
+ for(auto& entry: range2) {
+ count++;
+ ASSERT_EQ(entry.getPath().find("-/j/head.js"), 0);
+ }
+ ASSERT_EQ(count, 1);
+
+ // A bare namespace letter ("I") ...
+ ASSERT_EQ(range3.begin()->getIndex(), 75);
+ count = 0;
+ for(auto& entry: range3) {
+ count++;
+ std::cout << entry.getPath() << std::endl;
+ ASSERT_EQ(entry.getPath().find("I"), 0);
+ }
+ ASSERT_EQ(count, 34);
+
+ // ... and the same namespace with a trailing slash ("I/") give the same range.
+ ASSERT_EQ(range4.begin()->getIndex(), 75);
+ count = 0;
+ for(auto& entry: range4) {
+ count++;
+ std::cout << entry.getPath() << std::endl;
+ ASSERT_EQ(entry.getPath().find("I/"), 0);
+ }
+ ASSERT_EQ(count, 34);
+
+ // The empty path yields 118 entries with consecutive indexes starting at 0.
+ count = 0;
+ for(auto& entry: range5) {
+ ASSERT_EQ(count, entry.getIndex());
+ count++;
+ }
+ ASSERT_EQ(count, 118);
+
+ // "/" yields the same 118 entries as the empty path.
+ count = 0;
+ for(auto& entry: range6) {
+ ASSERT_EQ(count, entry.getIndex());
+ count++;
+ }
+ ASSERT_EQ(count, 118);
+ }
+}
+
+// By Path
+// Checks findByPath() on the "nons" variant of the test archive, where the
+// searched paths carry no namespace prefix.
+TEST(FindTests, ByPathNons)
+{
+ for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "nons")) {
+ zim::Archive archive (testfile.path);
+
+ auto range0 = archive.findByPath("Першая_старонка.html");
+ auto range1 = archive.findByPath("П");
+ auto range2 = archive.findByPath("");
+ auto range3 = archive.findByPath("/");
+
+ // Full path of a single entry.
+ auto count = 0;
+ for(auto& entry: range0) {
+ count++;
+ ASSERT_EQ(entry.getPath().find("Першая_старонка.html"), 0);
+ }
+ ASSERT_EQ(count, 1);
+
+ // A one-character (non-ASCII) prefix matches two entries.
+ count = 0;
+ for(auto& entry: range1) {
+ count++;
+ std::cout << entry.getPath() << std::endl;
+ ASSERT_EQ(entry.getPath().find("П"), 0);
+ }
+ ASSERT_EQ(count, 2);
+
+ // The empty path yields 109 entries with consecutive indexes starting at 0.
+ count = 0;
+ for(auto& entry: range2) {
+ ASSERT_EQ(count, entry.getIndex());
+ count++;
+ }
+ ASSERT_EQ(count, 109);
+
+ // "/" yields the same 109 entries as the empty path.
+ count = 0;
+ for(auto& entry: range3) {
+ ASSERT_EQ(count, entry.getIndex());
+ count++;
+ }
+ ASSERT_EQ(count, 109);
+ }
+}
+#endif
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <stdexcept>
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#endif
+
+#include <iostream>
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+#include "../src/fileheader.h"
+#include "../src/buffer.h"
+#include "../src/buffer_reader.h"
+
+#include "tools.h"
+
+namespace
+{
+
+using zim::unittests::TempFile;
+using zim::unittests::write_to_buffer;
+
+// Fills a Fileheader through its setters, checks the getters, then serializes
+// it with write_to_buffer() and reads it back into a second header to verify
+// that every field survives the round trip.
+TEST(HeaderTest, read_write_header)
+{
+  zim::Fileheader header;
+  header.setUuid("123456789\0abcd\nf");
+  header.setArticleCount(4711);
+  header.setUrlPtrPos(12345);
+  header.setTitleIdxPos(23456);
+  header.setClusterCount(14);
+  header.setClusterPtrPos(45678);
+  header.setMainPage(11);
+  header.setLayoutPage(13);
+  header.setMimeListPos(72);
+
+  // The getters must return exactly what was set.
+  ASSERT_EQ(header.getUuid(), "123456789\0abcd\nf");
+  ASSERT_EQ(header.getArticleCount(), 4711U);
+  ASSERT_EQ(header.getUrlPtrPos(), 12345U);
+  ASSERT_EQ(header.getTitleIdxPos(), 23456U);
+  ASSERT_EQ(header.getClusterCount(), 14U);
+  ASSERT_EQ(header.getClusterPtrPos(), 45678U);
+  ASSERT_EQ(header.getMainPage(), 11U);
+  ASSERT_EQ(header.getLayoutPage(), 13U);
+  ASSERT_EQ(header.getMimeListPos(), 72U);
+
+  // Serialize, then parse the bytes back into a fresh header.
+  auto buffer = write_to_buffer(header);
+  zim::Fileheader header2;
+  header2.read(zim::BufferReader(buffer));
+
+  // Every field set above must be preserved, including the mime list
+  // position, which the original round-trip check left out.
+  ASSERT_EQ(header2.getUuid(), "123456789\0abcd\nf");
+  ASSERT_EQ(header2.getArticleCount(), 4711U);
+  ASSERT_EQ(header2.getUrlPtrPos(), 12345U);
+  ASSERT_EQ(header2.getTitleIdxPos(), 23456U);
+  ASSERT_EQ(header2.getClusterCount(), 14U);
+  ASSERT_EQ(header2.getClusterPtrPos(), 45678U);
+  ASSERT_EQ(header2.getMainPage(), 11U);
+  ASSERT_EQ(header2.getLayoutPage(), 13U);
+  ASSERT_EQ(header2.getMimeListPos(), 72U);
+}
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/item.h>
+#include <zim/search.h>
+#include <zim/suggestion.h>
+#include <zim/writer/item.h>
+
+#include "tools.h"
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using zim::unittests::TempZimArchive;
+using zim::unittests::TestItem;
+using zim::unittests::IsFrontArticle;
+
+
+// IndexData implementation used by the tests below: only the content is
+// configurable. Title and keywords are empty, the word count is 1 and the
+// geo-position tuple is (false, 0.0, 0.0).
+class TestIndexData : public zim::writer::IndexData {
+  public:
+    explicit TestIndexData(const std::string& content)
+      : m_content(content)
+    {}
+
+    // Index data is reported as present only when the content is non-empty.
+    bool hasIndexData() const override { return ! m_content.empty(); }
+    std::string getTitle() const override { return ""; }
+    std::string getContent() const override { return m_content; }
+    std::string getKeywords() const override { return ""; }
+    uint32_t getWordCount() const override { return 1; }
+    IndexData::GeoPosition getGeoPosition() const override { return std::make_tuple(false, 0.0, 0.0); }
+
+  private:
+    std::string m_content;
+};
+
+// TestItem whose getIndexData() returns the pointer given at construction
+// (which may be null), so tests can control what index data an item exposes.
+class IndexDataItem : public TestItem {
+  public:
+    IndexDataItem(const std::string& path, const std::string& mimetype, const std::string& title, const std::string& content, std::shared_ptr<zim::writer::IndexData> indexData)
+      : TestItem(path, mimetype, title, content),
+        mp_indexData(indexData)
+    {}
+
+    std::shared_ptr<zim::writer::IndexData> getIndexData() const override { return mp_indexData; }
+
+  private:
+    std::shared_ptr<zim::writer::IndexData> mp_indexData;
+};
+
+#if defined(ENABLE_XAPIAN)
+
+// With indexing enabled and items that use the default index data, a search
+// for the shared content must return only the text/html item.
+TEST(IndexCriteria, defaultIndexingBaseOnMimeType)
+{
+  TempZimArchive tza("testZim");
+  zim::writer::Creator creator;
+  creator.configIndexing(true, "en");
+  creator.startZimCreation(tza.getPath());
+
+  // Same title and content, different mimetypes.
+  auto htmlItem = std::make_shared<TestItem>("HtmlTestPath", "text/html", "Test Article", "This is a test article");
+  creator.addItem(htmlItem);
+  auto plainItem = std::make_shared<TestItem>("OtherTestPath", "text/plain", "Test Article", "This is a test article");
+  creator.addItem(plainItem);
+  creator.finishZimCreation();
+
+  zim::Archive archive(tza.getPath());
+
+  zim::Searcher searcher(archive);
+  zim::Query query("test article");
+  auto search = searcher.search(query);
+  ASSERT_EQ(1, search.getEstimatedMatches());
+
+  auto results = search.getResults(0, archive.getEntryCount());
+  auto it = results.begin();
+  ASSERT_EQ(it.getPath(), "HtmlTestPath");
+  ASSERT_EQ(++it, results.end());
+}
+
+// Mixes items using the default index data with items providing their own
+// (null, empty or filled) IndexData, and checks which ones a search returns.
+TEST(IndexCriteria, specificIndexData)
+{
+  TempZimArchive tza("testZim");
+  zim::writer::Creator creator;
+  creator.configIndexing(true, "en");
+  creator.startZimCreation(tza.getPath());
+
+  // Html content is indexed by default
+  auto defaultHtml = std::make_shared<TestItem>("HtmlTestPath", "text/html", "Test Article", "This is a test article");
+  creator.addItem(defaultHtml);
+
+  // Non html content is not indexed by default
+  auto defaultPlain = std::make_shared<TestItem>("OtherTestPath", "text/plain", "Test Article", "This is a test article");
+  creator.addItem(defaultPlain);
+
+  // Item without a IndexData is not indexed
+  auto htmlWithNullIndexData = std::make_shared<IndexDataItem>("HtmlTestPathNull", "text/html", "Test Article", "This is a test article", nullptr);
+  creator.addItem(htmlWithNullIndexData);
+
+  // Item with a IndexData but without data is not indexed
+  auto htmlWithEmptyIndexData = std::make_shared<IndexDataItem>("HtmlTestPathNodata", "text/html", "Test Article", "This is a test article",
+                                                                std::make_shared<TestIndexData>(""));
+  creator.addItem(htmlWithEmptyIndexData);
+
+  // We index the content with the data of the indexdata if provided
+  auto plainWithIndexData = std::make_shared<IndexDataItem>("OtherTestPathWithIndex", "text/plain", "Test Article", "This is content",
+                                                             std::make_shared<TestIndexData>("test article"));
+  creator.addItem(plainWithIndexData);
+  creator.finishZimCreation();
+
+  zim::Archive archive(tza.getPath());
+
+  zim::Searcher searcher(archive);
+  zim::Query query("test article");
+  auto search = searcher.search(query);
+  ASSERT_EQ(2, search.getEstimatedMatches());
+
+  auto results = search.getResults(0, archive.getEntryCount());
+  auto it = results.begin();
+  ASSERT_EQ(it.getPath(), "HtmlTestPath");
+  it++;
+  ASSERT_EQ(it.getPath(), "OtherTestPathWithIndex");
+  ASSERT_EQ(++it, results.end());
+}
+
+#endif // ENABLE_XAPIAN
+
+// Checks which entries are returned as title suggestions, depending on the
+// mimetype and on an explicit front-article hint given at creation time.
+TEST(IndexCriteria, suggestion) {
+ TempZimArchive tza("testZim");
+ zim::writer::Creator creator;
+
+ creator.startZimCreation(tza.getPath());
+
+ // Default html is title indexed
+ creator.addItem(
+ std::make_shared<TestItem>("HtmlTestPath", "text/html", "Test Article", "This is a test article")
+ );
+
+ // Default not html is not title indexed
+ creator.addItem(
+ std::make_shared<TestItem>("OtherTestPath", "text/plain", "Test Article", "This is a test article")
+ );
+
+ // Default redirection is not indexed (even if pointing to html content)
+ creator.addRedirection("Aredirect", "Test Article Redirection", "HtmlTestPath");
+
+ // We can force a html content to not be title indexed
+ creator.addItem(
+ std::make_shared<TestItem>("HtmlTestPathForced", "text/html", "Test Article", "This is a test article", IsFrontArticle::NO)
+ );
+
+ // We can force a non-html content to be title indexed
+ creator.addItem(
+ std::make_shared<TestItem>("OtherTestPathForced", "text/plain", "Test Article", "This is a test article", IsFrontArticle::YES)
+ );
+
+ // Redirection needs to point to something not already indexed.
+ // As we collapse the suggestions by target path, if we have a redirection to an indexed entry,
+ // the suggestion result will contain only one of them.
+ creator.addRedirection("AredirectForced", "Test Article Redirection", "OtherTestPath", {{zim::writer::FRONT_ARTICLE, 1}});
+
+ creator.finishZimCreation();
+
+ zim::Archive archive(tza.getPath());
+
+ zim::SuggestionSearcher suggestionSearcher(archive);
+ auto suggestion = suggestionSearcher.suggest("Test Article");
+
+ // Expected suggestions: the default html item and the two forced entries.
+ ASSERT_EQ(3, suggestion.getEstimatedMatches());
+ auto result = suggestion.getResults(0, archive.getEntryCount());
+ auto begin = result.begin();
+ ASSERT_EQ(begin->getPath(), "HtmlTestPath");
+ begin++;
+ ASSERT_EQ(begin->getPath(), "OtherTestPathForced");
+ begin++;
+ ASSERT_EQ(begin->getPath(), "AredirectForced");
+ ASSERT_EQ(++begin, result.end());
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "istreamreader.h"
+#include "endian_tools.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using namespace zim;
+
+////////////////////////////////////////////////////////////////////////////////
+// IDataStream
+////////////////////////////////////////////////////////////////////////////////
+
+// Implement the IStreamReader interface in the simplest way
+// IStreamReader whose readImpl() fills the requested bytes with zeros,
+// whatever the requested size.
+class InfiniteZeroStream : public IStreamReader
+{
+  // `override` makes the compiler check this still matches the base-class hook.
+  void readImpl(char* buf, zim::zsize_t nbytes) override { memset(buf, 0, nbytes.v); }
+};
+
+// ... and test that it compiles and works as intended
+
+// Reading integers through the IStreamReader interface from an all-zero
+// stream must yield zero.
+TEST(IStreamReader, read)
+{
+  InfiniteZeroStream zeroStream;
+  IStreamReader& reader = zeroStream;
+
+  EXPECT_EQ(0, reader.read<int>());
+  EXPECT_EQ(0L, reader.read<long>());
+
+  // zim::fromLittleEndian() handles only integer types
+  // EXPECT_EQ(0.0, reader.read<double>());
+}
+
+// A sub-reader of N bytes taken from an all-zero stream must report size N
+// and hand out a buffer of N zero bytes.
+TEST(IStreamReader, sub_reader)
+{
+  const size_t N = 16;
+  const char allZeros[N] = {0};
+
+  InfiniteZeroStream zeroStream;
+  IStreamReader& reader = zeroStream;
+
+  auto sub = reader.sub_reader(zim::zsize_t(N));
+  EXPECT_EQ(sub->size().v, N);
+
+  auto content = sub->get_buffer(zim::offset_t(0), zim::zsize_t(N));
+  EXPECT_EQ(content.size().v, N);
+  EXPECT_EQ(0, memcmp(content.data(), allZeros, N));
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2009 Miguel Rocha
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/archive.h>
+#include <zim/error.h>
+#include <zim/item.h>
+
+#include "tools.h"
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using zim::unittests::getDataFilePath;
+
+#if WITH_TEST_DATA
+// getEntryByClusterOrder(i) must return, for each position i, the entry whose
+// index is listed at that position in the table below.
+TEST(ClusterIteratorTest, getEntryByClusterOrder)
+{
+  const std::vector<zim::entry_index_type> expectedIndexes = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 109, 110, 111, 112, 113, 114, 115, 116,
+117, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108 };
+
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "withns")) {
+    zim::Archive archive (testfile.path);
+
+    // The table must cover every entry of the archive.
+    auto nbEntries = archive.getEntryCount();
+    ASSERT_EQ(nbEntries, expectedIndexes.size());
+
+    unsigned int position = 0;
+    while (position < nbEntries) {
+      EXPECT_EQ(archive.getEntryByClusterOrder(position).getIndex(), expectedIndexes[position]);
+      position++;
+    }
+  }
+}
+
+// Asking for the entry at index == getEntryCount() (one past the last valid
+// index) must throw std::out_of_range with the message "entry index out of range".
+TEST(getEntry, indexOutOfRange)
+{
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "withns")) {
+    zim::Archive archive (testfile.path);
+
+    const auto nbEntries = archive.getEntryCount();
+
+    try {
+      archive.getEntryByPath(nbEntries);
+      FAIL() << "Should throw exception\n";
+    } catch (const std::out_of_range& e) {
+      ASSERT_EQ(e.what(), std::string("entry index out of range"));
+    } catch(...) {
+      // Something was thrown, but not the expected type: say so rather than
+      // claiming that nothing was thrown.
+      FAIL() << "Should throw std::out_of_range, but another exception type was thrown\n";
+    }
+  }
+}
+
+// Efficient order
+// Iterating archive.iterEfficient() must yield the entries in the order given
+// by the table below, and exactly as many entries as the table lists.
+TEST(IteratorTests, begin)
+{
+  std::vector<zim::entry_index_type> expected = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 109, 110, 111, 112, 113, 114, 115, 116,
+117, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108 };
+
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "withns")) {
+    zim::Archive archive (testfile.path);
+
+    size_t i = 0;
+    for(auto& entry: archive.iterEfficient()) {
+      // Stop before reading past the end of the table if the archive yields
+      // more entries than expected.
+      ASSERT_LT(i, expected.size());
+      EXPECT_EQ(entry.getIndex(), expected[i]);
+      i++;
+    }
+    // An iteration that ends early (or yields nothing) must fail as well.
+    ASSERT_EQ(i, expected.size());
+  }
+}
+
+
+// ByTitle
+// The first ten entries yielded by archive.iterByTitle() must have the
+// expected indexes; the expected list depends on the test file category.
+TEST(IteratorTests, beginByTitle)
+{
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim")) {
+    zim::Archive archive (testfile.path);
+    // Previously a second, function-level `expected` vector was declared and
+    // always shadowed by this one; only this one is used.
+    std::vector<zim::entry_index_type> expected;
+    if (testfile.category == "withns") {
+      expected = { 5, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+    } else {
+      expected = { 41, 42, 43, 44, 45, 46, 47, 48, 49, 50};
+    }
+
+    auto it = archive.iterByTitle().begin();
+
+    int i = 0;
+    while (i < 10)
+    {
+      EXPECT_EQ(it->getIndex(), expected[i]);
+      it++; i++;
+    }
+    std::cout << "\n";
+  }
+}
+
+
+// ByPath
+// The first ten entries yielded by archive.iterByPath() must have the indexes 0 to 9.
+TEST(IteratorTests, beginByPath)
+{
+  const std::vector<zim::entry_index_type> expectedIndexes = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+
+  for(auto& testfile:getDataFilePath("wikibooks_be_all_nopic_2017-02.zim", "withns")) {
+    zim::Archive archive (testfile.path);
+
+    auto it = archive.iterByPath().begin();
+    for (int position = 0; position < 10; position++, it++) {
+      EXPECT_EQ(it->getIndex(), expectedIndexes[position]);
+    }
+  }
+}
+
+// Exercises the iterator operations (dereference, assignment, increment,
+// decrement, comparison with end()) on the one-element range returned by
+// findByTitle() for the main entry's title.
+TEST(IteartorTests, iteratorFunctions)
+{
+ for(auto& testfile:getDataFilePath("wikipedia_en_climate_change_nopic_2020-01.zim")) {
+ const zim::Archive archive(testfile.path);
+ ASSERT_TRUE(archive.hasTitleIndex());
+ const auto mainItem = archive.getMainEntry().getItem(true);
+ auto range = archive.findByTitle(mainItem.getTitle());
+ ASSERT_EQ(range.size(), 1);
+ auto it1 = range.begin();
+ ASSERT_EQ(it1->getTitle(), mainItem.getTitle());
+
+ auto it2 = range.begin();
+ it2 = it1; // test the assignment operator
+ ASSERT_EQ(it2->getTitle(), mainItem.getTitle());
+
+ // Advancing past the only element must reach end() ...
+ it1++;
+ ASSERT_EQ(it1, range.end());
+ // ... and this test expects dereferencing that iterator not to throw.
+ ASSERT_NO_THROW(it1->getTitle());
+
+ // Stepping back must land on the single element again.
+ it1--;
+ ASSERT_EQ(it1->getTitle(), mainItem.getTitle());
+ }
+}
+
+#endif
+
+} // namespace
--- /dev/null
+/*
+ * Copyright (c) 2014, lamerman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of lamerman nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ */
+
+#include "lrucache.h"
+#include "concurrent_cache.h"
+#include "gtest/gtest.h"
+
+// Sizing of the capacity test: NUM_OF_TEST2_RECORDS entries are inserted in a
+// cache created with TEST2_CACHE_CAPACITY.
+const int NUM_OF_TEST2_RECORDS = 100;
+const int TEST2_CACHE_CAPACITY = 50;
+
+// A value that was put in the cache can be found and read back.
+TEST(CacheTest, SimplePut) {
+  zim::lru_cache<int, int> cache(1);
+
+  cache.put(7, 777);
+
+  EXPECT_TRUE(cache.exists(7));
+  EXPECT_EQ(777, cache.get(7));
+  EXPECT_EQ(1, cache.size());
+}
+
+// Putting a second value under the same key replaces the first one and does
+// not grow the cache.
+TEST(CacheTest, OverwritingPut) {
+  zim::lru_cache<int, int> cache(1);
+
+  cache.put(7, 777);
+  cache.put(7, 222);
+
+  EXPECT_TRUE(cache.exists(7));
+  EXPECT_EQ(222, cache.get(7));
+  EXPECT_EQ(1, cache.size());
+}
+
+// Getting a key that was never put reports a miss, and reading the value of
+// that result throws std::range_error.
+TEST(CacheTest, MissingValue) {
+  zim::lru_cache<int, int> cache(1);
+
+  EXPECT_TRUE(cache.get(7).miss());
+  EXPECT_FALSE(cache.get(7).hit());
+  EXPECT_THROW(cache.get(7).value(), std::range_error);
+}
+
+// drop() removes an existing key (returning true) and returns false for a key
+// that is no longer in the cache.
+TEST(CacheTest, DropValue) {
+  zim::lru_cache<int, int> cache(3);
+  cache.put(7, 777);
+  cache.put(8, 888);
+  cache.put(9, 999);
+
+  // All three values fit; key 7 is present before the drop.
+  EXPECT_EQ(3, cache.size());
+  EXPECT_TRUE(cache.exists(7));
+  EXPECT_EQ(777, cache.get(7));
+
+  EXPECT_TRUE(cache.drop(7));
+
+  // Key 7 is gone and the size went down by one.
+  EXPECT_EQ(2, cache.size());
+  EXPECT_FALSE(cache.exists(7));
+  EXPECT_THROW(cache.get(7).value(), std::range_error);
+
+  // Dropping it a second time reports that nothing was removed.
+  EXPECT_FALSE(cache.drop(7));
+}
+
+// After inserting more records than the capacity, only the most recently
+// inserted TEST2_CACHE_CAPACITY keys must remain in the cache.
+TEST(CacheTest1, KeepsAllValuesWithinCapacity) {
+  zim::lru_cache<int, int> cache(TEST2_CACHE_CAPACITY);
+
+  for (int key = 0; key < NUM_OF_TEST2_RECORDS; ++key) {
+    cache.put(key, key);
+  }
+
+  // Keys below this bound were inserted first and must have been evicted.
+  const int firstKeptKey = NUM_OF_TEST2_RECORDS - TEST2_CACHE_CAPACITY;
+
+  for (int key = 0; key < firstKeptKey; ++key) {
+    EXPECT_FALSE(cache.exists(key));
+  }
+
+  for (int key = firstKeptKey; key < NUM_OF_TEST2_RECORDS; ++key) {
+    EXPECT_TRUE(cache.exists(key));
+    EXPECT_EQ(key, cache.get(key));
+  }
+
+  size_t size = cache.size();
+  EXPECT_EQ(TEST2_CACHE_CAPACITY, size);
+}
+
+// An exception thrown by the value factory must propagate out of getOrPut(),
+// and a later getOrPut() on the same key must still be able to store a value.
+TEST(ConcurrentCacheTest, handleException) {
+  zim::ConcurrentCache<int, int> cache(1);
+
+  auto stored = cache.getOrPut(7, []() { return 777; });
+  EXPECT_EQ(stored, 777);
+
+  // The factory for key 8 fails ...
+  EXPECT_THROW(cache.getOrPut(8, []() { throw std::runtime_error("oups"); return 0; }), std::runtime_error);
+
+  // ... and a second attempt on key 8 succeeds.
+  stored = cache.getOrPut(8, []() { return 888; });
+  EXPECT_EQ(stored, 888);
+}
--- /dev/null
+# Names of the test programs. Each name `foo` is built from `foo.cpp`
+# (plus `tools.cpp`) and registered as the test `foo` in the loop below.
+tests = [
+ 'lrucache',
+ 'cluster',
+ 'creator',
+ 'dirent',
+ 'header',
+ 'uuid',
+ 'template',
+ 'archive',
+ 'iterator',
+ 'reader',
+ 'find',
+ 'compression',
+ 'dirent_lookup',
+ 'istreamreader',
+ 'decoderstreamreader',
+ 'rawstreamreader',
+ 'bufferstreamer',
+ 'parseLongPath',
+ 'random',
+ 'tooltesting',
+ 'tinyString',
+ 'suggestion_iterator',
+ 'indexing_criteria'
+]
+
+# These tests are only added when the xapian dependency was found.
+if xapian_dep.found()
+ tests += ['search', 'defaultIndexdata', 'search_iterator', 'suggestion']
+endif
+
+# `test_data_dir` option:
+#   'none'  -> tests are compiled with WITH_TEST_DATA=0
+#   ''      -> test data is expected in `<current build dir>/data`
+#   other   -> the given directory is used as is
+# Unless the option is 'none', a `download_test_data` run target is defined
+# that runs the downloader on the chosen directory.
+datadir = get_option('test_data_dir')
+if datadir == 'none'
+ test_cpp_args = '-DWITH_TEST_DATA=0'
+else
+ test_cpp_args = '-DWITH_TEST_DATA=1'
+ if datadir == ''
+ # We need to download the test data.
+ datadir = join_paths(meson.current_build_dir(), 'data')
+ endif
+ run_target('download_test_data', command : [test_data_downloader, '--remove-top-dir', datadir])
+endif
+
+# The data directory is passed to the tests through the environment.
+testenv = environment()
+testenv.set('ZIM_TEST_DATA_DIR', datadir)
+
+# Tests are built and registered only when gtest is available and this is not
+# a cross build.
+if gtest_dep.found() and not meson.is_cross_build()
+ foreach test_name : tests
+ test_exe = executable(test_name, [test_name+'.cpp', 'tools.cpp'],
+ implicit_include_directories: false,
+ include_directories: [include_directory, src_directory],
+ link_with: libzim,
+ link_args: extra_link_args,
+ cpp_args: test_cpp_args,
+ dependencies: deps + [gtest_dep],
+ build_rpath: '$ORIGIN')
+ test(test_name, test_exe, timeout : 120, env: testenv)
+ endforeach
+endif
--- /dev/null
+/*
+ * Copyright (C) 2020 Matthieu Gautier mgautier@kymeria.fr
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "gtest/gtest.h"
+#include <string>
+#include <tuple>
+
+// Forward declaration of the function under test. It is declared by hand
+// here instead of being pulled in through a header; the tests below expect
+// it to return the (namespace, path) pair and to throw std::runtime_error
+// on malformed input.
+namespace zim {
+ std::tuple<char, std::string> parseLongPath(const std::string& longPath);
+};
+
+using namespace zim;
+
+namespace
+{
+// Every malformed long path must be rejected with std::runtime_error.
+TEST(ParseLongPathTest, invalid)
+{
+  const char* const malformedPaths[] = {
+    "",
+    "AB",
+    "AB/path",
+    "/",
+    "//",
+    "/AB",
+    "AB/",
+    "/AB/path",
+    "//A/path"
+  };
+
+  for (const char* candidate : malformedPaths) {
+    // The streamed value identifies the offending input on failure.
+    ASSERT_THROW(parseLongPath(candidate), std::runtime_error) << candidate;
+  }
+}
+
+// Well-formed long paths are split into a one-character namespace and the
+// remaining path. A single leading '/' is optional, the path may be empty,
+// and any '/' after the namespace separator belongs to the path.
+TEST(ParseLongPathTest, valid)
+{
+  struct Expectation {
+    const char* longPath;
+    char ns;
+    const char* path;
+  };
+
+  const Expectation expectations[] = {
+    {"A/path",                 'A', "path"},
+    {"A/p",                    'A', "p"},
+    {"/B/path",                'B', "path"},
+    {"/B/p",                   'B', "p"},
+    {"C//path",                'C', "/path"},
+    {"/C//path",               'C', "/path"},
+    {"L/path/with/separator",  'L', "path/with/separator"},
+    {"L//path/with/separator", 'L', "/path/with/separator"},
+    {"A",                      'A', ""},
+    {"/A",                     'A', ""},
+    {"A/",                     'A', ""},
+    {"/A/",                    'A', ""}
+  };
+
+  for (const auto& expected : expectations) {
+    char ns;
+    std::string path;
+    std::tie(ns, path) = parseLongPath(expected.longPath);
+    ASSERT_EQ(ns, expected.ns) << expected.longPath;
+    ASSERT_EQ(path, expected.path) << expected.longPath;
+  }
+}
+};
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier mgautier@kymeria.fr
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "gtest/gtest.h"
+
+// Forward declaration of the function under test, declared by hand instead
+// of being pulled in through a header. The tests below expect the returned
+// value to lie in [0, max], both bounds included.
+namespace zim {
+ uint32_t randomNumber(uint32_t max);
+};
+
+using namespace zim;
+
+namespace
+{
+// With a tiny upper bound the generator must never exceed it:
+// randomNumber(0) is always 0 and randomNumber(1) is 0 or 1.
+TEST(Random, smallMax)
+{
+  for(auto i=0; i<1000; i++) {
+    // Unsigned literal: avoids a signed/unsigned comparison with uint32_t.
+    ASSERT_EQ(randomNumber(0), 0U);
+  }
+
+  for(auto i=0; i<1000; i++) {
+    const auto r = randomNumber(1);
+    // r is unsigned, so only the upper bound can actually be violated
+    // (the former `r>=0` check was always true).
+    ASSERT_LE(r, 1U);
+  }
+}
+
+// Draws NB_NUMBERS values in [0, MAX_RANDOM], spreads them over NB_BUCKETS
+// equally sized buckets and checks that every bucket is filled within 10%
+// of the expected count.
+TEST(Random, distribution)
+{
+  const uint32_t NB_NUMBERS = 1000000;
+  const uint32_t NB_BUCKETS = 100;
+  const uint32_t BUCKET_SIZE = NB_NUMBERS/NB_BUCKETS;
+  const uint32_t MAX_RANDOM = 1000000;
+  std::vector<uint32_t> distribution(NB_BUCKETS);
+
+  for (auto i=0U; i<NB_NUMBERS; i++) {
+    const auto r = randomNumber(MAX_RANDOM);
+    // Guard the indexing below: an out-of-range value must fail the test
+    // instead of writing past the end of the vector.
+    ASSERT_LE(r, MAX_RANDOM);
+
+    // Pure integer arithmetic (widened to 64 bits so the product cannot
+    // overflow): the index is exact and is a real integer, not a float
+    // implicitly truncated when used as a subscript.
+    auto bucket_index = static_cast<size_t>(uint64_t(r) * NB_BUCKETS / MAX_RANDOM);
+    if (bucket_index >= NB_BUCKETS) {
+      // This only happens when r == MAX_RANDOM.
+      bucket_index = NB_BUCKETS-1;
+    }
+    distribution[bucket_index]++;
+  }
+
+  // Each bucket should have around BUCKET_SIZE elements.
+  // Test this is true at 10%.
+  for(auto nbElement:distribution) {
+    ASSERT_GT(nbElement, BUCKET_SIZE*0.9);
+    ASSERT_LT(nbElement, BUCKET_SIZE*1.1);
+  }
+}
+
+
+};
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "rawstreamreader.h"
+#include "buffer.h"
+#include "buffer_reader.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using namespace zim;
+
+// Copies the bytes exposed by `buffer` into a std::string of the same length.
+std::string toString(const Buffer& buffer)
+{
+  const auto byteCount = buffer.size().v;
+  return std::string(buffer.data(), byteCount);
+}
+
+// Reads a little-endian integer, two consecutive sub-readers and a second
+// little-endian integer from the same stream, checking that each read
+// continues where the previous one stopped.
+TEST(ReaderDataStreamWrapper, shouldJustWork)
+{
+  // Layout: [0,4) uint32 1234 | [4,8) "efgh" | [8,18) "ijklmnopqr" | [18,26) int64
+  char payload[] = "abcdefghijklmnopqrstuvwxyz";
+  toLittleEndian(uint32_t(1234), payload);
+  toLittleEndian(int64_t(-987654321), payload+18);
+
+  auto bufferReader = std::make_shared<BufferReader>(Buffer::makeBuffer(payload, zsize_t(sizeof(payload))));
+
+  RawStreamReader stream(bufferReader);
+
+  ASSERT_EQ(1234, stream.read<uint32_t>());
+
+  const auto fourBytes = stream.sub_reader(zsize_t(4))->get_buffer(offset_t(0), zsize_t(4));
+  ASSERT_EQ("efgh", toString(fourBytes));
+
+  const auto tenBytes = stream.sub_reader(zsize_t(10))->get_buffer(offset_t(0), zsize_t(10));
+  ASSERT_EQ("ijklmnopqr", toString(tenBytes));
+
+  ASSERT_EQ(-987654321, stream.read<int64_t>());
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "tools.h"
+#include "buffer_reader.h"
+#include "file_reader.h"
+#include "fs.h"
+#include "file_compound.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using namespace zim;
+using zim::unittests::makeTempFile;
+
+////////////////////////////////////////////////////////////////////////////////
+// FileReader
+////////////////////////////////////////////////////////////////////////////////
+
+// Writes `data` to a temporary file and returns a FileReader covering
+// `size` bytes of it, starting at offset 0.
+std::unique_ptr<Reader> createFileReader(const char* data, zsize_t size) {
+  const auto backingFile = makeTempFile("data", data);
+  auto openedFile = DEFAULTFS::openFile(backingFile->path());
+  auto sharedFd = std::make_shared<typename DEFAULTFS::FD>(std::move(openedFile));
+  std::unique_ptr<Reader> reader(new FileReader(std::move(sharedFd), offset_t(0), size));
+  return reader;
+}
+
+// Writes `data` to a temporary file and returns a MultiPartFileReader built
+// on a FileCompound opened from that file.
+//
+// The size parameter only exists so that this factory has the same signature
+// as the other create*Reader functions; it is deliberately left unnamed
+// because no size is passed to MultiPartFileReader.
+std::unique_ptr<Reader> createMultiFileReader(const char* data, zsize_t /*size*/) {
+  const auto tmpfile = makeTempFile("data", data);
+  auto fileCompound = std::make_shared<FileCompound>(tmpfile->path());
+  return std::unique_ptr<Reader>(new MultiPartFileReader(fileCompound));
+}
+
+// Returns a BufferReader over a Buffer made from `size` bytes at `data`.
+std::unique_ptr<Reader> createBufferReader(const char* data, zsize_t size) {
+  auto wrapped = Buffer::makeBuffer(data, size);
+  std::unique_ptr<Reader> reader(new BufferReader(wrapped));
+  return reader;
+}
+
+// Reader factories exercised by every test below: each test body runs once
+// per entry. All factories share the (const char* data, zsize_t size)
+// signature.
+auto createReaders = {
+ createFileReader,
+ createMultiFileReader,
+ createBufferReader
+};
+
+// Basic reads on a 26 byte reader, for every reader implementation:
+// single characters, ranges, zero-length reads and out-of-bounds accesses.
+TEST(FileReader, shouldJustWork)
+{
+  char data[] = "abcdefghijklmnopqrstuvwxyz";
+  for(auto& createReader:createReaders) {
+    // A buffer-backed reader reports the address of the data as its offset;
+    // the file-backed ones start at 0.
+    const bool bufferBacked = (createReader == createBufferReader);
+    const auto baseOffset = bufferBacked ? ((offset_type)data) : 0;
+    auto reader = createReader(data, zsize_t(26));
+
+    ASSERT_EQ(offset_t(baseOffset+0), reader->offset());
+    ASSERT_EQ(zsize_t(sizeof(data)-1), reader->size());
+
+    ASSERT_EQ('a', reader->read(offset_t(0)));
+    ASSERT_EQ('e', reader->read(offset_t(4)));
+
+    // `window` is reused across reads: a short read only overwrites its
+    // first bytes, the rest keeps the content of the previous read.
+    char window[4] = {0, 0, 0, 0};
+    reader->read(window, offset_t(0), zsize_t(4));
+    ASSERT_EQ(std::string(window, 4), "abcd");
+
+    reader->read(window, offset_t(5), zsize_t(2));
+    ASSERT_EQ(std::string(window, 4), "fgcd");
+
+    // A zero-length read leaves the destination untouched.
+    reader->read(window, offset_t(10), zsize_t(0));
+    ASSERT_EQ(std::string(window, 4), "fgcd");
+
+    reader->read(window, offset_t(10), zsize_t(4));
+    ASSERT_EQ(std::string(window, 4), "klmn");
+
+    // The last byte of the data is readable.
+    ASSERT_EQ('z', reader->read(offset_t(25)));
+    reader->read(window, offset_t(25), zsize_t(1));
+    ASSERT_EQ(std::string(window, 4), "zlmn");
+
+    // Any access past the end must throw.
+    ASSERT_THROW(reader->read(offset_t(26)), std::runtime_error);
+    ASSERT_THROW(reader->read(window, offset_t(25), zsize_t(4)), std::runtime_error);
+    ASSERT_THROW(reader->read(window, offset_t(30), zsize_t(4)), std::runtime_error);
+    ASSERT_THROW(reader->read(window, offset_t(30), zsize_t(0)), std::runtime_error);
+  }
+}
+
+// Same checks as the basic test, on a 20 byte sub-reader starting at
+// offset 4 of the parent ("efgh...x"), for every reader implementation.
+TEST(FileReader, subReader)
+{
+  char data[] = "abcdefghijklmnopqrstuvwxyz";
+  for(auto& createReader:createReaders) {
+    const bool bufferBacked = (createReader == createBufferReader);
+    const auto baseOffset = bufferBacked ? ((offset_type)data) : 0;
+    auto parent = createReader(data, zsize_t(26));
+
+    auto subReader = parent->sub_reader(offset_t(4), zsize_t(20));
+
+    // The sub-reader offset is expressed relative to the parent's base.
+    ASSERT_EQ(offset_t(baseOffset+4), subReader->offset());
+    ASSERT_EQ(zsize_t(20), subReader->size());
+
+    ASSERT_EQ('e', subReader->read(offset_t(0)));
+    ASSERT_EQ('i', subReader->read(offset_t(4)));
+
+    // `window` is reused: short reads only overwrite their first bytes.
+    char window[4] = {0, 0, 0, 0};
+    subReader->read(window, offset_t(0), zsize_t(4));
+    ASSERT_EQ(std::string(window, 4), "efgh");
+
+    subReader->read(window, offset_t(5), zsize_t(2));
+    ASSERT_EQ(std::string(window, 4), "jkgh");
+
+    // The last byte of the sub-reader is readable.
+    ASSERT_EQ('x', subReader->read(offset_t(19)));
+    subReader->read(window, offset_t(19), zsize_t(1));
+    ASSERT_EQ(std::string(window, 4), "xkgh");
+
+    // Any access past the end of the sub-reader must throw, even though the
+    // parent still has data there.
+    ASSERT_THROW(subReader->read(offset_t(20)), std::runtime_error);
+    ASSERT_THROW(subReader->read(window, offset_t(18), zsize_t(4)), std::runtime_error);
+    ASSERT_THROW(subReader->read(window, offset_t(30), zsize_t(4)), std::runtime_error);
+    ASSERT_THROW(subReader->read(window, offset_t(30), zsize_t(0)), std::runtime_error);
+  }
+}
+
+// A reader of size 0: every non-empty read throws, a zero-length read at
+// offset 0 succeeds and does not touch the destination.
+TEST(FileReader, zeroReader)
+{
+  char data[] = "";
+  for(auto& createReader:createReaders) {
+    const bool bufferBacked = (createReader == createBufferReader);
+    const auto baseOffset = bufferBacked ? ((offset_type)data) : 0;
+    auto reader = createReader(data, zsize_t(0));
+
+    ASSERT_EQ(offset_t(baseOffset), reader->offset());
+    ASSERT_EQ(zsize_t(0), reader->size());
+
+    // Reading anything out of an empty reader must throw.
+    ASSERT_THROW(reader->read(offset_t(0)), std::runtime_error);
+    char window[4] = {0, 0, 0, 0};
+    ASSERT_THROW(reader->read(window, offset_t(0), zsize_t(4)), std::runtime_error);
+
+    // Reading 0 bytes is fine and leaves `window` zero-filled.
+    reader->read(window, offset_t(0), zsize_t(0));
+    ASSERT_EQ(std::string(window, 4), std::string(4, '\0'));
+  }
+}
+
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/item.h>
+#include <zim/search.h>
+
+#include <xapian.h>
+
+#include "tools.h"
+#include "gtest/gtest.h"
+
+namespace
+{
+
+using zim::unittests::TempZimArchive;
+using zim::unittests::TestItem;
+
+// Runs a full-text search for `query` on `archive` and returns the snippets
+// of the results in the range [0, range), in result order.
+std::vector<std::string> getSnippet(const zim::Archive archive, std::string query, int range) {
+  zim::Searcher searcher(archive);
+  zim::Query parsedQuery(query);
+  auto search = searcher.search(parsedQuery);
+  auto matches = search.getResults(0, range);
+
+  std::vector<std::string> collected;
+  auto cursor = matches.begin();
+  while (cursor != matches.end()) {
+    collected.push_back(cursor.getSnippet());
+    cursor++;
+  }
+  return collected;
+}
+
+// Compares the snippets returned by getSnippet(archive, query, range) with
+// the list given as trailing arguments.
+// Despite the EXPECT_ prefix this expands to ASSERT_EQ, so a mismatch aborts
+// the current test.
+#define EXPECT_SNIPPET_EQ(archive, range, query, ...) \
+ ASSERT_EQ( \
+ getSnippet(archive, query, range), \
+ std::vector<std::string>({__VA_ARGS__}) \
+ )
+
+// To secure compatibility of new zim files with older kiwixes, we need to
+// index the full path of the entries as data of the documents.
+// The data stored in the index for a document must start with the "C/"
+// namespace prefix, i.e. it holds the full path of the entry.
+TEST(Search, indexFullPath)
+{
+  TempZimArchive tempZim("testZim");
+  zim::writer::Creator zimCreator;
+  zimCreator.configIndexing(true, "en");
+  zimCreator.startZimCreation(tempZim.getPath());
+
+  zimCreator.addItem(
+    std::make_shared<TestItem>("testPath", "text/html", "Test Article", "This is a test article"));
+
+  zimCreator.setMainPath("testPath");
+  zimCreator.addMetadata("Title", "Test zim");
+  zimCreator.finishZimCreation();
+
+  zim::Archive archive(tempZim.getPath());
+
+  zim::Searcher searcher(archive);
+  zim::Query query("test article");
+  auto search = searcher.search(query);
+
+  ASSERT_NE(0, search.getEstimatedMatches());
+  auto result = search.getResults(0, archive.getEntryCount());
+  auto firstHit = result.begin();
+  ASSERT_EQ(firstHit.getPath(), "testPath");
+  ASSERT_EQ(firstHit.getDbData().substr(0, 2), "C/");
+}
+
+// The snippet of a full-text result wraps every matched query term in <b>.
+TEST(Search, fulltextSnippet)
+{
+  TempZimArchive tempZim("testZim");
+  zim::writer::Creator zimCreator;
+  zimCreator.configIndexing(true, "en");
+  zimCreator.startZimCreation(tempZim.getPath());
+  zimCreator.addItem(
+    std::make_shared<TestItem>("testPath", "text/html", "Test Article", "this is the content of a random paragraph without any context"));
+
+  zimCreator.setMainPath("testPath");
+  zimCreator.addMetadata("Title", "Test zim");
+  zimCreator.finishZimCreation();
+
+  zim::Archive archive(tempZim.getPath());
+
+  const std::vector<std::string> expectedSnippets = {
+    "this is the content of a <b>random</b> <b>paragraph</b> without any <b>context</b>"
+  };
+  ASSERT_EQ(getSnippet(archive, "random paragraph context", 1), expectedSnippets);
+}
+
+// Exercises a Searcher holding two archives (the same zim file opened
+// twice): result ranges, iteration up to end(), reuse of the searcher for
+// another query and searching through a copy of the searcher.
+TEST(Search, multiSearch)
+{
+ TempZimArchive tza("testZim");
+
+ zim::writer::Creator creator;
+ creator.configIndexing(true, "en");
+ creator.startZimCreation(tza.getPath());
+ // Five articles: all contain "test article", two contain "super" and two
+ // contain "temp0".
+ creator.addItem(std::make_shared<TestItem>("path0", "text/html", "Test Article0", "This is a test article. temp0"));
+ creator.addItem(std::make_shared<TestItem>("path1", "text/html", "Test Article1", "This is another test article. For article1."));
+ creator.addItem(std::make_shared<TestItem>("path2", "text/html", "Test Article001", "This is a test article. Super. temp0"));
+ creator.addItem(std::make_shared<TestItem>("path3", "text/html", "Test Article2", "This is a test article. Super."));
+ creator.addItem(std::make_shared<TestItem>("path4", "text/html", "Test Article23", "This is a test article. bis."));
+
+ creator.setMainPath("path0");
+ creator.finishZimCreation();
+
+ zim::Archive archive(tza.getPath());
+
+ zim::Searcher searcher(archive);
+
+ // Second Archive object opened from the same file and added to the searcher.
+ zim::Archive archive2(tza.getPath());
+ searcher.addArchive(archive2);
+
+ searcher.setVerbose(true);
+ zim::Query query("test article");
+ auto search0 = searcher.search(query);
+
+ ASSERT_EQ(archive.getEntryCount(), search0.getEstimatedMatches());
+ auto result0 = search0.getResults(0, 2);
+ ASSERT_EQ(result0.size(), 2);
+ auto it0 = result0.begin();
+
+ auto result1 = search0.getResults(0, 5);
+ ASSERT_EQ(result1.size(), 5);
+ auto it1 = result1.begin();
+
+ // The first two results of both ranges must be the same entries; result0
+ // ends after 2 steps, result1 after 5.
+ ASSERT_EQ(it0.getPath(), it1.getPath());
+ it0++; it1++;
+ ASSERT_EQ(it0.getPath(), it1.getPath());
+ it0++; it1++;
+ ASSERT_EQ(it0, result0.end());
+ it1++;it1++;it1++;
+ ASSERT_EQ(it1, result1.end());
+
+ // Check result retrieval in start ranges
+ auto result2 = search0.getResults(0, 3); // Should return 3 results
+ ASSERT_EQ(result2.size(), 3);
+
+ // Check result retrieval in middle ranges
+ auto result3 = search0.getResults(2, 3); // Should Return 3 result
+ ASSERT_EQ(result3.size(), 3);
+
+ // Be able to do a different search using the same searcher.
+ query.setQuery("super");
+ auto search1 = searcher.search(query);
+ ASSERT_EQ(2, search1.getEstimatedMatches());
+
+ // A copy of the searcher must be usable for yet another query.
+ auto searcher2(searcher);
+ searcher2.setVerbose(true);
+ query.setQuery("temp0");
+ auto search2 = searcher2.search(query);
+ auto result = search2.getResults(0, search2.getEstimatedMatches());
+ ASSERT_EQ(2, search2.getEstimatedMatches());
+ ASSERT_EQ(2, result.size());
+}
+
+// Searching an archive created with indexing disabled must throw
+// std::runtime_error.
+TEST(Search, noFTIndex)
+{
+  TempZimArchive tempZim("testZim");
+
+  zim::writer::Creator zimCreator;
+  zimCreator.configIndexing(false, "en");
+  zimCreator.startZimCreation(tempZim.getPath());
+  zimCreator.addItem(
+    std::make_shared<TestItem>("path0", "text/html", "Test Article0", "This is a test article. temp0"));
+
+  zimCreator.setMainPath("path0");
+  zimCreator.finishZimCreation();
+
+  zim::Archive archive(tempZim.getPath());
+
+  zim::Searcher searcher(archive);
+  searcher.setVerbose(true);
+  zim::Query unindexedQuery("test article");
+  ASSERT_THROW(searcher.search(unindexedQuery), std::runtime_error);
+}
+
+// An archive indexed with the "nostem" language is still searchable, here
+// through a searcher that starts empty and receives the archive afterwards.
+TEST(Search, noStemming)
+{
+  TempZimArchive tempZim("testZim");
+
+  zim::writer::Creator zimCreator;
+  zimCreator.configIndexing(true, "nostem");
+  zimCreator.startZimCreation(tempZim.getPath());
+  zimCreator.addItem(
+    std::make_shared<TestItem>("path0", "text/html", "Test Article0", "This is a test article. temp0"));
+  zimCreator.addItem(
+    std::make_shared<TestItem>("path1", "text/html", "Test Article1", "This is another test article. For article1."));
+
+  zimCreator.setMainPath("path0");
+  zimCreator.finishZimCreation();
+
+  zim::Archive archive(tempZim.getPath());
+
+  zim::Searcher searcher(std::vector<zim::Archive>{});
+  searcher.addArchive(archive);
+  searcher.setVerbose(true);
+
+  zim::Query query("test article");
+  auto search = searcher.search(query);
+
+  ASSERT_EQ(archive.getEntryCount(), search.getEstimatedMatches());
+  auto topResult = search.getResults(0, 1);
+  ASSERT_EQ(topResult.begin().getTitle(), "Test Article0");
+}
+
+// A query restricted to a geographic range around the article's
+// geo.position meta tag must find that article.
+TEST(Search, geoQuery)
+{
+  TempZimArchive tempZim("testZim");
+
+  // NOTE(review): the document ends with "<html>" instead of "</html>".
+  // This looks like a typo in the fixture; confirm before changing it since
+  // it alters the indexed content.
+  std::string geoTaggedPage = R"(<html><head><meta name="keywords" content="some keyword important"><meta name="geo.position" content="45.000;10.000"></head><body>Test geoquery</body><html>)";
+  zim::writer::Creator zimCreator;
+  zimCreator.configIndexing(true, "en");
+  zimCreator.startZimCreation(tempZim.getPath());
+  zimCreator.addItem(
+    std::make_shared<TestItem>("path0", "text/html", "Test Article", geoTaggedPage));
+
+  zimCreator.setMainPath("path0");
+  zimCreator.finishZimCreation();
+
+  zim::Archive archive(tempZim.getPath());
+
+  zim::Searcher searcher(archive);
+  searcher.setVerbose(true);
+
+  // Same coordinates as the geo.position tag, with a range of 100.
+  zim::Query geoQuery("geoquery");
+  geoQuery.setGeorange(45.000, 10.000, 100);
+  auto search = searcher.search(geoQuery);
+
+  ASSERT_EQ(archive.getEntryCount(), search.getEstimatedMatches());
+  auto topResult = search.getResults(0, 1);
+  ASSERT_EQ(topResult.begin().getTitle(), "Test Article");
+}
+} // unnamed namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/search.h>
+#include <zim/search_iterator.h>
+#include <zim/error.h>
+#include "tools.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+using zim::unittests::TempZimArchive;
+
+// A default-constructed iterator returns neutral values from its plain
+// accessors and throws when asked for the zim id or when dereferenced.
+TEST(search_iterator, uninitialized) {
+  zim::SearchResultSet::iterator blankIt;
+
+  // Accessors with a neutral fallback value.
+  ASSERT_EQ(blankIt.getTitle(), "");
+  ASSERT_EQ(blankIt.getPath(), "");
+  ASSERT_EQ(blankIt.getSnippet(), "");
+  ASSERT_EQ(blankIt.getScore(), 0);
+  ASSERT_EQ(blankIt.getFileIndex(), 0);
+  ASSERT_EQ(blankIt.getWordCount(), -1);
+  ASSERT_EQ(blankIt.getSize(), -1);
+
+  // Operations that need an actual result.
+  ASSERT_THROW(blankIt.getZimId(), std::runtime_error);
+  ASSERT_THROW(*blankIt, std::runtime_error);
+  ASSERT_THROW(blankIt.operator->(), std::runtime_error);
+}
+
+// Behaviour of the accessors on the end() iterator of a non-empty result set.
+TEST(search_iterator, end) {
+  TempZimArchive tempZim("testZim");
+
+  zim::Archive archive = tempZim.createZimFromContent({
+    {"article 1", "item a"}
+  });
+
+  zim::Searcher searcher(archive);
+  zim::Query query("item");
+  auto search = searcher.search(query);
+  auto result = search.getResults(0, archive.getEntryCount());
+
+  auto pastTheEnd = result.end();
+
+  ASSERT_THROW(pastTheEnd.getTitle(), std::runtime_error);
+  ASSERT_THROW(pastTheEnd.getPath(), std::runtime_error);
+  ASSERT_EQ(pastTheEnd.getSnippet(), "");
+  // getScore() is not checked: its value is unspecified here (may be 0 or 1).
+  // To fix.
+  ASSERT_EQ(pastTheEnd.getFileIndex(), 0);
+  ASSERT_THROW(pastTheEnd.getWordCount(), std::runtime_error);
+  ASSERT_EQ(pastTheEnd.getSize(), -1);
+  ASSERT_THROW(*pastTheEnd, std::runtime_error);
+  ASSERT_THROW(pastTheEnd.operator->(), std::runtime_error);
+}
+
+// Copy construction and copy assignment keep the iterator position, both
+// for a valid iterator and for end().
+TEST(search_iterator, copy) {
+  TempZimArchive tempZim("testZim");
+
+  zim::Archive archive = tempZim.createZimFromContent({
+    {"article 1", "item a"}
+  });
+
+  zim::Searcher searcher(archive);
+  zim::Query query(std::string("item"));
+  auto search = searcher.search(query);
+  auto result = search.getResults(0, archive.getEntryCount());
+
+  // Copy-construct from a valid iterator.
+  auto original = result.begin();
+  auto duplicate = original;
+  ASSERT_EQ(original.getTitle(), duplicate.getTitle());
+
+  // Copy-assign from end(): both compare equal and both refuse to
+  // give a title.
+  original = result.end();
+  duplicate = original;
+  ASSERT_EQ(original, duplicate);
+  ASSERT_THROW(original.getTitle(), std::runtime_error);
+  ASSERT_THROW(duplicate.getTitle(), std::runtime_error);
+}
+
+// Checks the values returned by the iterator accessors on real results, and
+// that titles keep their original case and accents.
+TEST(search_iterator, functions) {
+  TempZimArchive tempZim("testZim");
+
+  // The number of occurrences of "item" decreases from one article to the
+  // next, forcing the result order through the wdf.
+  zim::Archive archive = tempZim.createZimFromContent({
+    {"item a", "item item item"},
+    {"Item B", "item item 2"},
+    {"iTem ć", "item number 3"}
+  });
+
+  zim::Searcher searcher(archive);
+  zim::Query query("item");
+  auto search = searcher.search(query);
+  auto result = search.getResults(0, archive.getEntryCount());
+
+  auto hit = result.begin();
+
+  // Accessors on the best match.
+  ASSERT_EQ(hit.getTitle(), "item a");
+  ASSERT_EQ(hit.getPath(), "dummyPathitem a");
+  ASSERT_EQ(hit.getScore(), 100);
+  ASSERT_EQ(hit.getFileIndex(), 0);
+  ASSERT_EQ(hit.getZimId(), archive.getUuid());
+  ASSERT_EQ(hit.getWordCount(), 3);
+  ASSERT_EQ(hit.getSize(), -1); // Unimplemented
+
+  // Titles with upper case letters and accents are returned unchanged.
+  hit++;
+  ASSERT_EQ(hit.getTitle(), "Item B");
+  hit++;
+  ASSERT_EQ(hit.getTitle(), "iTem ć");
+}
+
+// Walks forward and backward over a two-element result set and checks the
+// comparisons against begin() and end().
+TEST(search_iterator, iteration) {
+  TempZimArchive tempZim("testZim");
+
+  // The two articles have a different wdf for "item".
+  zim::Archive archive = tempZim.createZimFromContent({
+    {"article 1", "item"},
+    {"article 2", "another item in article 2"}
+  });
+
+  zim::Searcher searcher(archive);
+  auto search = searcher.search(std::string("item"));
+  auto result = search.getResults(0, archive.getEntryCount());
+
+  auto cursor = result.begin();
+  ASSERT_EQ(cursor.getTitle(), result.begin().getTitle());
+
+  // Forward one step.
+  ASSERT_EQ(cursor.getTitle(), "article 1");
+  cursor++;
+  ASSERT_EQ(cursor.getTitle(), "article 2");
+  ASSERT_TRUE(cursor != result.begin());
+
+  // Back to the first result.
+  cursor--;
+  ASSERT_EQ(cursor.getTitle(), "article 1");
+  ASSERT_TRUE(result.begin() == cursor);
+
+  // Two steps forward reach end().
+  cursor++; cursor++;
+  ASSERT_TRUE(cursor == result.end());
+}
+
+} // anonymous namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+
+#include <zim/archive.h>
+#include <zim/suggestion.h>
+#include <zim/item.h>
+
+#include "tools.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+ using zim::unittests::TempZimArchive;
+ using zim::unittests::TestItem;
+ using zim::unittests::getDataFilePath;
+
+ // Asks a verbose SuggestionSearcher on `archive` for suggestions matching
+ // `query` and returns the titles of the results in the range [0, range),
+ // in result order.
+ std::vector<std::string> getSuggestions(const zim::Archive archive, std::string query, int range) {
+   zim::SuggestionSearcher searcher(archive);
+   searcher.setVerbose(true);
+   auto search = searcher.suggest(query);
+   auto matches = search.getResults(0, range);
+
+   std::vector<std::string> suggestedTitles;
+   const auto last = matches.end();
+   for (auto cursor = matches.begin(); cursor != last; ++cursor) {
+     auto entry = *cursor;
+     suggestedTitles.push_back(entry.getTitle());
+   }
+   return suggestedTitles;
+ }
+
+ // Asks a SuggestionSearcher on `archive` for suggestions matching `query`
+ // and returns the snippets of the results in the range [0, range), in
+ // result order.
+ std::vector<std::string> getSnippet(const zim::Archive archive, std::string query, int range) {
+   zim::SuggestionSearcher searcher(archive);
+   auto search = searcher.suggest(query);
+   auto matches = search.getResults(0, range);
+
+   std::vector<std::string> collected;
+   const auto last = matches.end();
+   for (auto cursor = matches.begin(); cursor != last; ++cursor) {
+     auto entry = *cursor;
+     collected.push_back(entry.getSnippet());
+   }
+   return collected;
+ }
+
+// Compares the titles suggested for `query` (over the whole entry count of
+// `archive`) with the list given as trailing arguments.
+// Despite the EXPECT_ prefix this expands to ASSERT_EQ, so a mismatch aborts
+// the current test.
+#define EXPECT_SUGGESTION_RESULTS(archive, query, ...) \
+ ASSERT_EQ( \
+ getSuggestions(archive, query, archive.getEntryCount()), \
+ std::vector<std::string>({__VA_ARGS__}) \
+ )
+
+// Compares the snippets returned by getSnippet(archive, query, range) with
+// the list given as trailing arguments.
+// Despite the EXPECT_ prefix this expands to ASSERT_EQ, so a mismatch aborts
+// the current test.
+// The last line deliberately carries no continuation backslash: with one,
+// whatever line follows the macro would silently become part of it.
+#define EXPECT_SNIPPET_EQ(archive, range, query, ...) \
+  ASSERT_EQ( \
+    getSnippet(archive, query, range), \
+    std::vector<std::string>({__VA_ARGS__}) \
+  )
+
+#if WITH_TEST_DATA
+// For every available "small.zim" test file, suggesting the title of the
+// main item must return that item first.
+TEST(Suggestion, searchByTitle)
+{
+  for(auto& testfile:getDataFilePath("small.zim")) {
+    const zim::Archive archive(testfile.path);
+    ASSERT_TRUE(archive.hasTitleIndex());
+
+    const auto mainItem = archive.getMainEntry().getItem(true);
+
+    zim::SuggestionSearcher searcher(archive);
+    auto search = searcher.suggest(mainItem.getTitle());
+    ASSERT_NE(0, search.getEstimatedMatches());
+
+    auto matches = search.getResults(0, archive.getEntryCount());
+    ASSERT_EQ(mainItem.getPath(), matches.begin()->getPath());
+  }
+}
+#endif
+
+
+ // An empty query yields no suggestion at all.
+ TEST(Suggestion, emptyQuery) {
+   const std::vector<std::string> articleTitles = {
+     "fooland",
+     "berlin wall",
+     "hotel berlin, berlin",
+     "again berlin",
+     "berlin",
+     "not berlin"
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   const auto suggestions = getSuggestions(archive, "", archive.getEntryCount());
+   const std::vector<std::string> nothing;
+
+   ASSERT_EQ(suggestions, nothing);
+ }
+
+ // A query matching none of the titles yields no suggestion.
+ TEST(Suggestion, noResult) {
+   // Six distinct titles. A missing comma after "fooland" used to merge it
+   // with "berlin wall" into the single title "foolandberlin wall".
+   std::vector<std::string> titles = {
+     "fooland",
+     "berlin wall",
+     "hotel berlin, berlin",
+     "again berlin",
+     "berlin",
+     "not berlin"
+   };
+
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles(titles);
+
+   std::vector<std::string> resultSet = getSuggestions(archive, "none", archive.getEntryCount());
+   std::vector<std::string> expectedResult = {};
+
+   ASSERT_EQ(resultSet, expectedResult);
+ }
+
+ // Checks the order of the suggestions returned for a single-term query.
+ TEST(Suggestion, singleTermOrder) {
+   const std::vector<std::string> articleTitles = {
+     "fooland",
+     "berlin wall",
+     "hotel berlin, berlin",
+     "again berlin",
+     "berlin",
+     "not berlin"
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   // "fooland" does not match and must be absent.
+   const std::vector<std::string> expected = {
+     "berlin",
+     "berlin wall",
+     "hotel berlin, berlin",
+     "again berlin",
+     "not berlin"
+   };
+   const auto suggestions = getSuggestions(archive, "berlin", archive.getEntryCount());
+
+   ASSERT_EQ(expected, suggestions);
+ }
+
+ // When more titles match than the requested range, only the first
+ // `range` suggestions are returned.
+ TEST(Suggestion, resultsGreaterThanLimit) {
+   const std::vector<std::string> articleTitles = {
+     "foobar b",
+     "foobar a",
+     "foobar c",
+     "foobar e",
+     "foobar d"
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   // Five titles match, only two are requested.
+   const std::vector<std::string> expected = {
+     "foobar a",
+     "foobar b"
+   };
+   const auto suggestions = getSuggestions(archive, "foobar", 2);
+
+   ASSERT_EQ(expected, suggestions);
+ }
+
+ // A query made of the beginning of a word ("Wo") matches the titles
+ // containing a word starting with it.
+ TEST(Suggestion, partialQuery) {
+   const std::vector<std::string> articleTitles = {
+     "The chocolate factory",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "Hour of the wolf",
+     "Wolf",
+     "Terma termb the wolf of wall street termc"
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   // Every title but "The chocolate factory" is expected.
+   const std::vector<std::string> expected = {
+     "Wolf",
+     "Hour of the wolf",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "Terma termb the wolf of wall street termc"
+   };
+   const auto suggestions = getSuggestions(archive, "Wo", archive.getEntryCount());
+
+   ASSERT_EQ(expected, suggestions);
+ }
+
+ // Checks the order of the suggestions for a multi-word query whose words
+ // appear in a different order in each matching title.
+ TEST(Suggestion, phraseOrder) {
+   const std::vector<std::string> articleTitles = {
+     "summer winter autumn",
+     "winter autumn summer terma",
+     "autumn summer winter",
+     "control document",
+     "summer",
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   // Only the titles holding the three query words are expected.
+   const std::vector<std::string> expected = {
+     "winter autumn summer terma",
+     "autumn summer winter",
+     "summer winter autumn"
+   };
+   const auto suggestions = getSuggestions(archive, "winter autumn summer", archive.getEntryCount());
+
+   ASSERT_EQ(expected, suggestions);
+ }
+
+ // Simulates a user typing a query word after word and checks the
+ // suggestions returned at every step.
+ TEST(Suggestion, incrementalSearch) {
+   const std::vector<std::string> articleTitles = {
+     "The chocolate factory",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "The wolf among sheeps",
+     "The wolf of Wall Street Book" ,
+     "Hour of the wolf",
+     "Wolf",
+     "Terma termb the wolf of wall street termc"
+   };
+
+   TempZimArchive tempZim("testZim");
+   const zim::Archive archive = tempZim.createZimFromTitles(articleTitles);
+
+   // All suggestions for `typed`, without limiting the number of results.
+   const auto suggest = [&archive](const std::string& typed) {
+     return getSuggestions(archive, typed, archive.getEntryCount());
+   };
+
+   // "wolf"
+   const std::vector<std::string> expectedForWolf = {
+     "Wolf",
+     "Hour of the wolf",
+     "The wolf among sheeps",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "The wolf of Wall Street Book",
+     "Terma termb the wolf of wall street termc"
+   };
+   ASSERT_EQ(expectedForWolf, suggest("Wolf"));
+
+   // "the"
+   const std::vector<std::string> expectedForThe = {
+     "The chocolate factory",
+     "The wolf among sheeps",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "The wolf of Wall Street Book",
+     "Hour of the wolf",
+     "Terma termb the wolf of wall street termc"
+   };
+   ASSERT_EQ(expectedForThe, suggest("the"));
+
+   // "the wolf"
+   const std::vector<std::string> expectedForTheWolf = {
+     "The wolf among sheeps",
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "The wolf of Wall Street Book",
+     "Hour of the wolf",
+     "Terma termb the wolf of wall street termc"
+   };
+   ASSERT_EQ(expectedForTheWolf, suggest("the wolf"));
+
+   // "the wolf of"
+   const std::vector<std::string> expectedForTheWolfOf = {
+     "The wolf of Shingashina",
+     "The wolf of Wall Street",
+     "The wolf of Wall Street Book",
+     "Terma termb the wolf of wall street termc",
+     "Hour of the wolf"
+   };
+   ASSERT_EQ(expectedForTheWolfOf, suggest("the wolf of"));
+
+   // "the wolf of wall"
+   const std::vector<std::string> expectedForTheWolfOfWall = {
+     "The wolf of Wall Street",
+     "The wolf of Wall Street Book",
+     "Terma termb the wolf of wall street termc"
+   };
+   ASSERT_EQ(expectedForTheWolfOfWall, suggest("the wolf of wall"));
+ }
+
+ // The query words appear in the titles with a varying number of unrelated
+ // terms between them; checks which titles are suggested and in which order.
+ TEST(Suggestion, phraseOutOfWindow) {
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles({
+     "This query",
+     "This is the dummy query phrase",
+     "the aterm bterm dummy cterm query",
+     "aterm the bterm dummy query cterm"
+   });
+
+   const std::vector<std::string> expectedResult = {
+     "This is the dummy query phrase",
+     "aterm the bterm dummy query cterm",
+     "the aterm bterm dummy cterm query"
+   };
+   const std::vector<std::string> resultSet =
+     getSuggestions(archive, "the dummy query", archive.getEntryCount());
+
+   ASSERT_EQ(expectedResult, resultSet);
+ }
+
+ // "she", "and" and "the" are stopwords. If stopwords are handled properly
+ // they still take part in the matching, so the only suggestion is the title
+ // that contains every word of the query.
+ TEST(Suggestion, checkStopword) {
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles({
+     "she and the apple",
+     "apple",
+     "she and the"
+   });
+
+   const std::vector<std::string> expectedResult = {
+     "she and the apple"
+   };
+   const std::vector<std::string> resultSet =
+     getSuggestions(archive, "she and the apple", archive.getEntryCount());
+
+   ASSERT_EQ(expectedResult, resultSet);
+ }
+
+ // An item and two redirections pointing directly at it all match the query;
+ // the suggestions must contain the target only, once.
+ TEST(Suggestion, checkRedirectionCollapse) {
+   TempZimArchive tza("testZim");
+   zim::writer::Creator creator;
+   creator.configIndexing(true, "en");
+   creator.startZimCreation(tza.getPath());
+
+   creator.addItem(std::make_shared<TestItem>("testPath", "text/html", "Article Target"));
+   creator.addRedirection("redirectionPath1", "Article Redirect 1", "testPath");
+   creator.addRedirection("redirectionPath2", "Article Redirect 2", "testPath");
+
+   creator.addMetadata("Title", "Test zim");
+   creator.finishZimCreation();
+
+   // A single suggestion is expected.
+   const std::vector<std::string> expectedResult = {
+     "Article Target",
+   };
+
+   zim::Archive archive(tza.getPath());
+   const std::vector<std::string> resultSet =
+     getSuggestions(archive, "Article", archive.getEntryCount());
+
+   ASSERT_EQ(resultSet, expectedResult);
+ }
+
+ TEST(Suggestion, checkRedirectionChain) {
+   /*
+    * As of now, redirection chains are not handled. For articles such as
+    * A->B->C, even though A, B and C are essentially the same article, they
+    * are not collapsed into one suggestion.
+    */
+   TempZimArchive tza("testZim");
+   zim::writer::Creator creator;
+   creator.configIndexing(true, "en");
+   creator.startZimCreation(tza.getPath());
+
+   auto item = std::make_shared<TestItem>("testPath", "text/html", "Article Target");
+   creator.addItem(item);
+   // redirectionPath2 -> redirectionPath1 -> testPath
+   creator.addRedirection("redirectionPath1", "Article Redirect 1", "testPath");
+   creator.addRedirection("redirectionPath2", "Article Redirect 2", "redirectionPath1", {{zim::writer::FRONT_ARTICLE, 1}});
+
+   creator.addMetadata("Title", "Test zim");
+   creator.finishZimCreation();
+
+   zim::Archive archive(tza.getPath());
+   std::vector<std::string> resultSet = getSuggestions(archive, "Article", archive.getEntryCount());
+
+   // The chain is not collapsed, so two results are expected: the target and
+   // the redirection that points to another redirection.
+   std::vector<std::string> expectedResult = {
+     "Article Target",
+     "Article Redirect 2"
+   };
+   // (expected, actual), as in most other tests of this file.
+   ASSERT_EQ(expectedResult, resultSet);
+ }
+
+ // Two distinct items sharing one title must both be suggested: identical
+ // titles alone are no reason to collapse results.
+ TEST(Suggestion, diffArticleSameTitle) {
+   TempZimArchive tza("testZim");
+   zim::writer::Creator creator;
+   creator.configIndexing(true, "en");
+   creator.startZimCreation(tza.getPath());
+
+   // Same title, different paths.
+   for (const char* itemPath : {"testPath1", "testPath2"}) {
+     creator.addItem(std::make_shared<TestItem>(itemPath, "text/html", "Test Article"));
+   }
+
+   creator.addMetadata("Title", "Test zim");
+   creator.finishZimCreation();
+
+   // Two results are expected.
+   const std::vector<std::string> expectedResult = {
+     "Test Article",
+     "Test Article"
+   };
+
+   zim::Archive archive(tza.getPath());
+   const std::vector<std::string> resultSet =
+     getSuggestions(archive, "Test Article", archive.getEntryCount());
+
+   ASSERT_EQ(resultSet, expectedResult);
+ }
+
+ // A title that begins with the search string must rank above titles that
+ // merely contain it further along.
+ TEST(Suggestion, anchorQueryToBeginning) {
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles({
+     "aterm bterm this is a title cterm",
+     "this is a title aterm bterm cterm",
+     "aterm this is a title bterm cterm"
+   });
+
+   const std::vector<std::string> expectedResult = {
+     "this is a title aterm bterm cterm",
+     "aterm bterm this is a title cterm",
+     "aterm this is a title bterm cterm"
+   };
+   const std::vector<std::string> resultSet =
+     getSuggestions(archive, "This is a title", archive.getEntryCount());
+
+   ASSERT_EQ(expectedResult, resultSet);
+ }
+
+ // To secure compatibility of new zim files with older kiwixes, we need to
+ // index the full path of the entries as data of documents.
+ TEST(Suggestion, indexFullPath) {
+   TempZimArchive tza("testZim");
+   zim::writer::Creator creator;
+   creator.configIndexing(true, "en");
+   creator.startZimCreation(tza.getPath());
+
+   auto item = std::make_shared<TestItem>("testPath", "text/html", "Test Article");
+   creator.addItem(item);
+
+   creator.addMetadata("Title", "Test zim");
+   creator.finishZimCreation();
+
+   zim::Archive archive(tza.getPath());
+
+   zim::SuggestionSearcher suggestionSearcher(archive);
+   auto suggestionSearch = suggestionSearcher.suggest("Test Article");
+   auto result = suggestionSearch.getResults(0, archive.getEntryCount());
+
+   // Fail with a clear message instead of dereferencing an end iterator when
+   // the search unexpectedly returns nothing.
+   auto firstResult = result.begin();
+   ASSERT_TRUE(firstResult != result.end());
+
+   ASSERT_EQ(firstResult->getPath(), "testPath");
+   // The data stored in the index starts with the "C/" prefix.
+   ASSERT_EQ(firstResult.getDbData().substr(0, 2), "C/");
+ }
+
+ // Queries and titles containing non-word characters ('&', '+', ',') must
+ // still lead to the corresponding title being suggested.
+ TEST(Suggestion, nonWordCharacters) {
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles({
+     "Alice Bob",
+     "Bonnie + Clyde",
+     "Jack & Jill, on the hill"
+   });
+
+   // '&' only in the query
+   EXPECT_SUGGESTION_RESULTS(archive, "Alice & Bob", "Alice Bob");
+
+   // '+' in both the query and the title
+   EXPECT_SUGGESTION_RESULTS(archive, "Bonnie + Clyde", "Bonnie + Clyde");
+
+   // the query is the beginning of a title that also contains ','
+   EXPECT_SUGGESTION_RESULTS(archive, "Jack & Jill", "Jack & Jill, on the hill");
+ }
+
+ // Checks the snippets attached to suggestions: the whole title is returned
+ // with every matched word wrapped in <b>...</b>.
+ TEST(Suggestion, titleSnippet) {
+   TempZimArchive tza("testzim");
+
+   const zim::Archive archive = tza.createZimFromTitles({
+     "this is a straight run of matching words",
+     "this is a broken set of likely words",
+     "this is a long title to ensure that the snippets generated contain the entire title even if match is one word"
+   });
+
+   // Consecutive matching words
+   EXPECT_SNIPPET_EQ(archive, 1, "straight run of matching", {
+     "this is a <b>straight</b> <b>run</b> <b>of</b> <b>matching</b> words"
+   });
+
+   // Matching words separated by non-matching ones
+   EXPECT_SNIPPET_EQ(archive, 1, "broken likely", {
+     "this is a <b>broken</b> set of <b>likely</b> words"
+   });
+
+   // A single match in a long title: the snippet is still the full title
+   EXPECT_SNIPPET_EQ(archive, 1, "generated", {
+     "this is a long title to ensure that the snippets <b>generated</b> contain the entire title even if match is one word"
+   });
+
+   // Every title matches; every occurrence of a query word is highlighted
+   EXPECT_SNIPPET_EQ(archive, archive.getEntryCount(), "this is", {
+     "<b>this</b> <b>is</b> a broken set of likely words",
+     "<b>this</b> <b>is</b> a straight run of matching words",
+     "<b>this</b> <b>is</b> a long title to ensure that the snippets generated contain the entire title even if match <b>is</b> one word"
+   });
+ }
+
+ // One SuggestionSearcher is used for two identical queries whose result
+ // windows ([0, 2) and then the rest) are counted together: 3 entries overall.
+ TEST(Suggestion, reuseSearcher) {
+   TempZimArchive tza("testZim");
+   const zim::Archive archive = tza.createZimFromTitles({
+     "song for you",
+     "sing a song for you",
+     "a song b for c you",
+     "song for someone"
+   });
+
+   zim::SuggestionSearcher suggestionSearcher(archive);
+   suggestionSearcher.setVerbose(true);
+
+   int count = 0;
+
+   // First window
+   auto suggestionSearch1 = suggestionSearcher.suggest("song for you");
+   auto suggestionResult1 = suggestionSearch1.getResults(0, 2);
+   for (auto it = suggestionResult1.begin(); it != suggestionResult1.end(); ++it) {
+     count++;
+   }
+
+   // Second window, obtained from the same searcher
+   auto suggestionSearch2 = suggestionSearcher.suggest("song for you");
+   auto suggestionResult2 = suggestionSearch2.getResults(2, archive.getEntryCount());
+   for (auto it = suggestionResult2.begin(); it != suggestionResult2.end(); ++it) {
+     count++;
+   }
+
+   ASSERT_EQ(count, 3);
+ }
+}
--- /dev/null
+/*
+ * Copyright (C) 2021 Maneesh P M <manu.pm55@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/suggestion.h>
+#include <zim/suggestion_iterator.h>
+#include <zim/error.h>
+#include "tools.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+
+using zim::unittests::TempZimArchive;
+
+#if defined(ENABLE_XAPIAN)
+
+// Accessing the entry or the item of an end iterator throws std::runtime_error.
+TEST(suggestion_iterator, end) {
+  TempZimArchive tza("testZim");
+  zim::Archive archive = tza.createZimFromContent({
+    {"article 1", "item a"}
+  });
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+  auto suggestionSearch = suggestionSearcher.suggest("item");
+  auto resultSet = suggestionSearch.getResults(0, archive.getEntryCount());
+
+  auto endIt = resultSet.end();
+
+  ASSERT_THROW(endIt.getEntry(), std::runtime_error);
+  ASSERT_THROW(*endIt, std::runtime_error);
+}
+
+// Copy-construction and copy-assignment of iterators, for both a valid
+// position and the end position.
+TEST(suggestion_iterator, copy) {
+  TempZimArchive tza("testZim");
+  zim::Archive archive = tza.createZimFromContent({
+    {"article 1", "item a"}
+  });
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+  auto suggestionSearch = suggestionSearcher.suggest("article");
+  auto resultSet = suggestionSearch.getResults(0, archive.getEntryCount());
+
+  // A copy-constructed iterator designates the same suggestion.
+  auto source = resultSet.begin();
+  auto duplicate = source;
+  ASSERT_EQ(source->getTitle(), duplicate->getTitle());
+
+  // After assigning the end position, both compare equal and both throw on
+  // dereference.
+  source = resultSet.end();
+  duplicate = source;
+  ASSERT_EQ(source, duplicate);
+  ASSERT_THROW(source->getTitle(), std::runtime_error);
+  ASSERT_THROW(duplicate->getTitle(), std::runtime_error);
+}
+
+// Accessors reachable from an iterator: the suggestion item (title, path)
+// and the archive entry.
+TEST(suggestion_iterator, functions) {
+  TempZimArchive tza("testZim");
+  zim::Archive archive = tza.createZimFromContent({
+    {"article 1", "item a"}
+  });
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+  auto suggestionSearch = suggestionSearcher.suggest("article");
+  auto resultSet = suggestionSearch.getResults(0, archive.getEntryCount());
+
+  auto first = resultSet.begin();
+
+  // Item accessors; createZimFromContent prefixes paths with "dummyPath".
+  ASSERT_EQ(first->getTitle(), "article 1");
+  ASSERT_EQ(first->getPath(), "dummyPatharticle 1");
+
+  // Entry accessor
+  auto firstEntry = first.getEntry();
+  ASSERT_EQ(firstEntry.getTitle(), "article 1");
+}
+
+// Walks forward and backward over a two-element result set and checks the
+// comparison operators along the way.
+TEST(suggestion_iterator, iteration) {
+  TempZimArchive tza("testZim");
+  zim::Archive archive = tza.createZimFromContent({
+    {"article a", "item a"},
+    {"article b", "item b"}
+  });
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+  auto suggestionSearch = suggestionSearcher.suggest("article");
+  auto resultSet = suggestionSearch.getResults(0, archive.getEntryCount());
+  auto start = resultSet.begin();
+
+  zim::SuggestionIterator cursor = start;
+  ASSERT_EQ(cursor->getTitle(), resultSet.begin()->getTitle());
+
+  // Forward by one
+  ASSERT_EQ(cursor->getTitle(), "article a");
+  cursor++;
+  ASSERT_EQ(cursor->getTitle(), "article b");
+  ASSERT_TRUE(cursor != start);
+  ASSERT_FALSE(cursor == start);
+
+  // Back to the first element
+  cursor--;
+  ASSERT_EQ(cursor->getTitle(), "article a");
+  ASSERT_TRUE(resultSet.begin() == cursor);
+
+  // Two steps forward reach the end
+  cursor++;
+  cursor++;
+  ASSERT_TRUE(cursor == resultSet.end());
+}
+
+#endif // ENABLE_XAPIAN
+
+// Exercises the iterator on range-based suggestions (forced when xapian is
+// enabled): size, accessors, copy, increment/decrement and comparison.
+TEST(suggestion_iterator, rangeBased) {
+  TempZimArchive tza("testZim");
+  zim::Archive archive = tza.createZimFromContent({
+    {"article a", "item a"},
+    {"article b", "item b"},
+    {"random c", "random c"}
+  });
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+  auto suggestionSearch = suggestionSearcher.suggest("article");
+
+#if defined(ENABLE_XAPIAN)
+  suggestionSearch.forceRangeSuggestion(); // Close xapian db to force rangeBased search
+#endif // ENABLE_XAPIAN
+
+  ASSERT_EQ(suggestionSearch.getEstimatedMatches(), 2);
+  auto resultSet = suggestionSearch.getResults(0, archive.getEntryCount());
+  ASSERT_EQ(resultSet.size(), 2);
+
+  // First suggestion, seen through the iterator and through its entry
+  auto first = resultSet.begin();
+  ASSERT_EQ(first->getTitle(), "article a");
+  ASSERT_EQ(first.getEntry().getPath(), "dummyPatharticle a");
+
+  // Items of range-based suggestions carry no snippet
+  auto firstItem = *first;
+  ASSERT_FALSE(firstItem.hasSnippet());
+  ASSERT_EQ(firstItem.getTitle(), "article a");
+
+  // Copy-construction
+  zim::SuggestionIterator other = first;
+  ASSERT_EQ(first->getTitle(), other->getTitle());
+
+  // Forward then backward
+  first++;
+  ASSERT_EQ(first->getTitle(), "article b");
+  first--;
+  ASSERT_EQ(first->getTitle(), "article a");
+
+  // Copy-assignment
+  other = first;
+  ASSERT_TRUE(other == first);
+
+  // NOTE(review): here the end position can be dereferenced and yields the
+  // title following the matching ones - presumably specific to range-based
+  // results; confirm this is intended.
+  other = resultSet.end();
+  ASSERT_EQ(other->getTitle(), "random c");
+}
+
+#if defined(ENABLE_XAPIAN)
+// Snippets must highlight the title words matching the query after stemming,
+// including when the same searcher is used for several queries.
+TEST(search_iterator, stemmedSearch) {
+  TempZimArchive tza("testZim");
+
+  // The following stemming occurs
+  // apple -> appl
+  // charlie -> charli
+  // chocolate -> chocol
+  // factory -> factori
+  zim::Archive archive = tza.createZimFromTitles({
+    "an apple a day, keeps the doctor away",
+    "charlie and the chocolate factory"
+  });
+
+  const std::string appleSnippet = "an <b>apple</b> a day, keeps the doctor away";
+  const std::string chocolateSnippet = "charlie and the <b>chocolate</b> <b>factory</b>";
+
+  zim::SuggestionSearcher suggestionSearcher(archive);
+
+  // The plural query matches the singular word of the title
+  auto suggestionSearch = suggestionSearcher.suggest("apples");
+  auto resultSet = suggestionSearch.getResults(0, 1);
+  ASSERT_EQ(resultSet.begin()->getSnippet(), appleSnippet);
+
+  suggestionSearch = suggestionSearcher.suggest("chocolate factory");
+  resultSet = suggestionSearch.getResults(0, 1);
+  ASSERT_EQ(resultSet.begin()->getSnippet(), chocolateSnippet);
+
+  // Test stemming with reused searcher
+  suggestionSearch = suggestionSearcher.suggest("apples");
+  resultSet = suggestionSearch.getResults(0, 1);
+  ASSERT_EQ(resultSet.begin()->getSnippet(), appleSnippet);
+}
+#endif // ENABLE_XAPIAN
+
+} // anonymous namespace
--- /dev/null
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "../src/template.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+// Fixture that feeds text to a zim::TemplateParser and records, in `result`,
+// a textual trace of every event the parser reports:
+//   - plain data is appended unchanged,
+//   - a token is recorded as "T(<token>)",
+//   - a link is recorded as "L(<namespace>, <title>)".
+class TemplateTest : public ::testing::Test, private zim::TemplateParser::Event
+{
+  public:
+    std::string result;          // trace of the events received so far
+    zim::TemplateParser parser;  // parser under test; reports its events to this fixture
+
+    TemplateTest() : parser(this) {}
+
+  private:
+    // Event callbacks. `override` makes the compiler verify that they really
+    // match the virtual functions of zim::TemplateParser::Event.
+    void onData(const std::string& data) override { result += data; }
+
+    void onToken(const std::string& token) override
+    {
+      result += "T(";
+      result += token;
+      result += ')';
+    }
+
+    void onLink(char ns, const std::string& title) override
+    {
+      result += "L(";
+      result += ns;
+      result += ", ";
+      result += title;
+      result += ')';
+    }
+};
+
+// Text without any template markup is passed through unchanged.
+TEST_F(TemplateTest, ZeroTemplate)
+{
+  parser.parse("<html><body><h1>Hi</h1></body></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html><body><h1>Hi</h1></body></html>");
+}
+
+// "<%name%>" is reported as a token.
+TEST_F(TemplateTest, Token)
+{
+  parser.parse("<html><%content%></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html>T(content)</html>");
+}
+
+// "<%/N/Title%>" is reported as a link with namespace N and title Title.
+TEST_F(TemplateTest, Link)
+{
+  parser.parse("<html><%/A/Article%></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html>L(A, Article)</html>");
+}
+
+}  // namespace
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "gtest/gtest.h"
+
+#include "../src/writer/tinyString.h"
+
+using namespace zim::writer;
+
+namespace
+{
+
+// A default-constructed TinyString is empty.
+TEST(TinyStringTest, empty)
+{
+  TinyString s;
+  ASSERT_TRUE(s.empty());
+  ASSERT_EQ(s.size(), 0);
+  ASSERT_EQ(static_cast<std::string>(s), "");
+  ASSERT_EQ(s, TinyString());
+}
+
+// A TinyString built from "" equals a default-constructed one.
+TEST(TinyStringTest, noChar)
+{
+  TinyString s("");
+  ASSERT_TRUE(s.empty());
+  ASSERT_EQ(s.size(), 0);
+  ASSERT_EQ(static_cast<std::string>(s), "");
+  ASSERT_EQ(s, TinyString());
+}
+
+// Size, conversion, ordering and equality with a single character.
+TEST(TinyStringTest, oneChar)
+{
+  TinyString s("A");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 1);
+  ASSERT_EQ(static_cast<std::string>(s), "A");
+  ASSERT_TRUE(s < TinyString("B"));
+  ASSERT_EQ(s, TinyString("A"));
+  ASSERT_FALSE(s == TinyString("B"));
+}
+
+// Ordering of a longer string against empty strings, prefixes and a string
+// differing in the second character.
+TEST(TinyStringTest, chars)
+{
+  TinyString s("ABCDE");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 5);
+  ASSERT_EQ(static_cast<std::string>(s), "ABCDE");
+
+  // s is not smaller than any of these...
+  ASSERT_FALSE(s < TinyString());
+  ASSERT_FALSE(s < TinyString(""));
+  ASSERT_FALSE(s < TinyString("A"));
+  ASSERT_FALSE(s < TinyString("ABCD"));
+  ASSERT_FALSE(s < TinyString("AACDE"));
+
+  // ...and each of them is smaller than s.
+  ASSERT_TRUE(TinyString() < s);
+  ASSERT_TRUE(TinyString("") < s);
+  ASSERT_TRUE(TinyString("A") < s);
+  ASSERT_TRUE(TinyString("ABCD") < s);
+  ASSERT_TRUE(TinyString("AACDE") < s);
+
+  // Compared with itself: equal, not smaller.
+  ASSERT_TRUE(s == s);
+  ASSERT_FALSE(s < s);
+}
+
+// A default-constructed PathTitleTinyString is completely empty.
+TEST(PathTitleTinyString, none)
+{
+  PathTitleTinyString s;
+  ASSERT_TRUE(s.empty());
+  ASSERT_EQ(s.size(), 0);
+  ASSERT_EQ(static_cast<std::string>(s), "");
+  ASSERT_EQ(s, TinyString());
+  ASSERT_EQ(s.getPath(), "");
+  ASSERT_EQ(s.getTitle(false), "");
+  ASSERT_EQ(s.getTitle(true), "");
+}
+
+// Empty path and title: only the NUL separator between them is stored.
+TEST(PathTitleTinyString, empty)
+{
+  PathTitleTinyString s("", "");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 1);
+  ASSERT_EQ(static_cast<std::string>(s), std::string(1, '\0'));
+  ASSERT_EQ(s.getPath(), "");
+  ASSERT_EQ(s.getTitle(false), "");
+  ASSERT_EQ(s.getTitle(true), "");
+}
+
+// Path without title: stored as path + separator; getTitle(false) gives the
+// path while getTitle(true) gives the (empty) stored title.
+TEST(PathTitleTinyString, no_title)
+{
+  PathTitleTinyString s("FOO", "");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 4);
+  ASSERT_EQ(static_cast<std::string>(s), std::string("FOO") + '\0');
+  ASSERT_EQ(s.getPath(), "FOO");
+  ASSERT_EQ(s.getTitle(false), "FOO");
+  ASSERT_EQ(s.getTitle(true), "");
+}
+
+// Title without path: stored as separator + title.
+TEST(PathTitleTinyString, no_path)
+{
+  PathTitleTinyString s("", "BAR");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 4);
+  ASSERT_EQ(static_cast<std::string>(s), std::string(1, '\0') + "BAR");
+  ASSERT_EQ(s.getPath(), "");
+  ASSERT_EQ(s.getTitle(false), "BAR");
+  ASSERT_EQ(s.getTitle(true), "BAR");
+}
+
+// Distinct path and title: stored as path + separator + title.
+TEST(PathTitleTinyString, path_title)
+{
+  PathTitleTinyString s("FOO", "BAR");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 7);
+  ASSERT_EQ(static_cast<std::string>(s), std::string("FOO") + '\0' + "BAR");
+  ASSERT_EQ(s.getPath(), "FOO");
+  ASSERT_EQ(s.getTitle(false), "BAR");
+  ASSERT_EQ(s.getTitle(true), "BAR");
+}
+
+// Title equal to the path: the title is not stored (same layout as no_title).
+TEST(PathTitleTinyString, equal_path_title)
+{
+  PathTitleTinyString s("FOO", "FOO");
+  ASSERT_FALSE(s.empty());
+  ASSERT_EQ(s.size(), 4);
+  ASSERT_EQ(static_cast<std::string>(s), std::string("FOO") + '\0');
+  ASSERT_EQ(s.getPath(), "FOO");
+  ASSERT_EQ(s.getTitle(false), "FOO");
+  ASSERT_EQ(s.getTitle(true), "");
+}
+}  // namespace
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "tools.h"
+
+#ifdef _WIN32
+#include <locale>
+#include <codecvt>
+#include <windows.h>
+#include <fileapi.h>
+#include <io.h>
+#else
+#include <dirent.h>
+#endif
+
+#include "../src/fs.h"
+
+#include <fcntl.h>
+#include <sys/stat.h>
+#include "gtest/gtest.h"
+
+namespace zim
+{
+
+namespace unittests
+{
+
+// Creates an empty temporary file whose name is derived from `name` and
+// records its path. The file is not kept open: fd() opens it on demand.
+//
+// Throws std::runtime_error if the file cannot be created.
+TempFile::TempFile(const char* name)
+  : fd_(-1)
+{
+#ifdef _WIN32
+  std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>> utfConv;
+  wchar_t cbase[MAX_PATH];
+  const std::wstring wname = utfConv.from_bytes(name);
+  GetTempPathW(MAX_PATH-(wname.size()+2), cbase);
+  // This creates an empty file, we just have to open it later.
+  // GetTempFileNameW returns 0 on failure.
+  if (GetTempFileNameW(cbase, wname.c_str(), 0, wpath_) == 0) {
+    throw std::runtime_error(std::string("Cannot create temporary file for ") + name);
+  }
+  path_ = utfConv.to_bytes(wpath_);
+#else
+  const char* const TMPDIR = std::getenv("TMPDIR");
+  const std::string tmpdir(TMPDIR ? TMPDIR : "/tmp");
+  path_ = tmpdir + "/" + name + "_XXXXXX";
+  // mkstemp rewrites the XXXXXX suffix in place and creates the file.
+  // It returns -1 on failure, which must not be passed on to ::close().
+  const int tmp_fd = mkstemp(&path_[0]);
+  if (tmp_fd == -1) {
+    throw std::runtime_error("Cannot create temporary file " + path_);
+  }
+  ::close(tmp_fd);
+#endif
+}
+
+// Releases the descriptor (if one is open) and then deletes the file itself.
+TempFile::~TempFile()
+{
+  this->close();
+
+#ifdef _WIN32
+  DeleteFileW(wpath_);
+#else
+  unlink(path_.c_str());
+#endif
+}
+
+// Returns a read/write descriptor on the file, opening it on first use and
+// handing out the same descriptor until close() is called.
+int TempFile::fd()
+{
+  // Already open: nothing to do.
+  if (fd_ != -1) {
+    return fd_;
+  }
+
+#ifdef _WIN32
+  fd_ = _wopen(wpath_, _O_RDWR | _O_BINARY);
+#else
+  fd_ = open(path_.c_str(), O_RDWR);
+#endif
+  return fd_;
+}
+
+// Closes the descriptor handed out by fd(), if any. Calling it again, or
+// without a prior fd(), is a no-op.
+void TempFile::close()
+{
+  if (fd_ == -1) {
+    return;
+  }
+
+  ::close(fd_);
+  fd_ = -1;
+}
+
+// Creates a temporary file named after `name` and fills it with `content`.
+// The returned TempFile is closed; the file is removed when it is destroyed.
+//
+// Throws std::runtime_error if the file cannot be opened or if the content
+// cannot be written completely.
+std::unique_ptr<TempFile>
+makeTempFile(const char* name, const std::string& content)
+{
+  std::unique_ptr<TempFile> p(new TempFile(name));
+
+  const int fd = p->fd();
+  if (fd == -1) {
+    throw std::runtime_error("Cannot open " + p->path());
+  }
+
+  // write() may write less than requested: loop until everything is written
+  // and report failures instead of silently leaving a truncated file.
+  size_t written = 0;
+  while (written < content.size()) {
+    const auto n = write(fd, content.data() + written, content.size() - written);
+    if (n <= 0) {
+      throw std::runtime_error("Cannot write to " + p->path());
+    }
+    written += static_cast<size_t>(n);
+  }
+
+  p->close();
+  return p;
+}
+
+// Stores in `dataDir` the value of the ZIM_TEST_DATA_DIR environment variable.
+// If the variable is not set, `dataDir` is set to "INVALID_DATA_DIR" and the
+// current test is marked as failed.
+void setDataDir(std::string& dataDir)
+{
+  const char* const envDataDir = std::getenv("ZIM_TEST_DATA_DIR");
+  if (envDataDir != NULL) {
+    dataDir = envDataDir;
+    return;
+  }
+
+  dataDir = "INVALID_DATA_DIR";
+  // FAIL must be used in a void function. So we need to use a out parameter.
+  FAIL() << "ZIM_TEST_DATA_DIR is not defined. You must define it to the directory containing test zim files.";
+}
+
+// Describes the test file `filename` of category `category`; its path is
+// <dataDir>/<category>/<filename>, joined with the default filesystem rules.
+TestFile::TestFile(const std::string& dataDir, const std::string& category, const std::string& filename)
+  : filename(filename)
+  , category(category)
+  , path(zim::DEFAULTFS::join(
+           zim::DEFAULTFS::join(dataDir, category),
+           filename))
+{}
+
+// Lists the candidate locations of the test file `filename` in the test data
+// directory (see setDataDir):
+//   - if `category` is given, only that category is considered;
+//   - otherwise one candidate is returned per category, i.e. per entry of the
+//     data directory whose name does not start with '.' or '_' (on Windows a
+//     fixed list of categories is used instead).
+// If the data directory cannot be opened, a single candidate with the
+// category "NO_DATA_DIR" is returned.
+const std::vector<TestFile> getDataFilePath(const std::string& filename, const std::string& category)
+{
+  std::vector<TestFile> filePaths;
+  std::string dataDirPath;
+  setDataDir(dataDirPath);
+
+  // We have asked for a particular category: no discovery needed.
+  if (!category.empty()) {
+    filePaths.emplace_back(dataDirPath, category, filename);
+    return filePaths;
+  }
+
+#ifdef _WIN32
+  // We don't have dirent.h in windows.
+  // If we move to test data out of the repository, we will need a way to discover the data.
+  // Use a static list of categories for now.
+  for (auto& knownCategory: {"withns", "nons"}) {
+    filePaths.emplace_back(dataDirPath, knownCategory, filename);
+  }
+#else
+  auto dataDir = opendir(dataDirPath.c_str());
+  if (!dataDir) {
+    filePaths.emplace_back(dataDirPath, "NO_DATA_DIR", filename);
+    return filePaths;
+  }
+
+  for (struct dirent* entry = readdir(dataDir); entry != NULL; entry = readdir(dataDir)) {
+    const char firstChar = entry->d_name[0];
+    // Skip ".", ".." and hidden or underscore-prefixed entries.
+    if (firstChar != '.' && firstChar != '_') {
+      filePaths.emplace_back(dataDirPath, entry->d_name, filename);
+    }
+  }
+  closedir(dataDir);
+#endif
+
+  return filePaths;
+}
+
+// Writes, into the temporary file, a ZIM archive with one "text/html" item
+// per title (path "dummyPath" + title, default TestItem content), indexed in
+// English, and returns the archive opened from that file.
+zim::Archive TempZimArchive::createZimFromTitles(std::vector<std::string> titles) {
+  zim::writer::Creator creator;
+  creator.configIndexing(true, "en");
+  creator.startZimCreation(this->path());
+
+  // add dummy items with given titles (iterate by reference: no need to copy
+  // each title)
+  for (const auto& title : titles) {
+    const std::string path = "dummyPath" + title;
+    auto item = std::make_shared<TestItem>(path, "text/html", title);
+    creator.addItem(item);
+  }
+
+  creator.addMetadata("Title", "This is a title");
+
+  creator.finishZimCreation();
+  return zim::Archive(this->path());
+}
+
+// Writes, into the temporary file, a ZIM archive with one "text/html" item
+// per element of `contents`, indexed in English, and returns the archive
+// opened from that file.
+//
+// Each element must be {title, content}; the item path is "dummyPath" + title.
+// Throws std::out_of_range if an element has fewer than two strings.
+zim::Archive TempZimArchive::createZimFromContent(std::vector<std::vector<std::string>> contents) {
+  zim::writer::Creator creator;
+  creator.configIndexing(true, "en");
+  creator.startZimCreation(this->path());
+
+  // add dummy items with given titles and contents
+  for (const auto& content : contents) {
+    // at() instead of []: a malformed row is reported, not undefined behaviour
+    const std::string& title = content.at(0);
+    const std::string& body = content.at(1);
+    const std::string path = "dummyPath" + title;
+    auto item = std::make_shared<TestItem>(path, "text/html", title, body);
+    creator.addItem(item);
+  }
+
+  creator.addMetadata("Title", "This is a title");
+
+  creator.finishZimCreation();
+  return zim::Archive(this->path());
+}
+
+// Path of the underlying temporary file (TempFile::path() itself is not
+// reachable from outside because TempFile is a private base).
+const std::string TempZimArchive::getPath() {
+  return path();
+}
+
+} // namespace unittests
+
+} // namespace zim
--- /dev/null
+/*
+ * Copyright (C) 2020 Veloman Yunkan
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_TEST_TOOLS_H
+#define ZIM_TEST_TOOLS_H
+
+
+#include <string>
+#include <vector>
+#include <sys/types.h>
+#ifdef _WIN32
+#include <windows.h>
+#include <io.h>
+#define LSEEK _lseeki64
+#else
+#include <unistd.h>
+#define LSEEK lseek
+#endif
+
+#include "../src/buffer.h"
+#include <limits.h>
+
+#define ZIM_PRIVATE
+#include <zim/archive.h>
+#include <zim/search.h>
+#include <zim/writer/creator.h>
+#include <zim/writer/item.h>
+#include <zim/writer/contentProvider.h>
+
+namespace zim
+{
+
+namespace unittests
+{
+
+// TempFile is a utility class for working with temporary files in RAII fashion:
+//
+// 1. An empty temporary file is created (in the temporary file directory)
+// by the constructor.
+//
+// 2. The file can be filled with data via the file descriptor (returned
+// by the fd() member function).
+//
+// -------------------------------------------------------------
+// | IMPORTANT! |
+// | |
+// | The file descriptor must NOT be close()-ed. Under Windows |
+// | this will result in the file being removed. |
+// -------------------------------------------------------------
+//
+// 3. The destructor automatically (closes and) removes the file
+//
+class TempFile
+{
+private:
+  int fd_;
+  std::string path_;
+#ifdef _WIN32
+  wchar_t wpath_[MAX_PATH];
+#endif
+
+public:
+  // Creates an empty file in the temporary directory (under Linux and friends
+  // the directory is taken from the TMPDIR environment variable, /tmp if unset)
+  explicit TempFile(const char* name);
+
+  // Closes the descriptor (if open) and removes the file
+  ~TempFile();
+
+  // Not copyable: the object owns the file
+  TempFile(const TempFile& ) = delete;
+  void operator=(const TempFile& ) = delete;
+
+  // File descriptor on the file
+  // Important! It must NOT be close()-ed directly; use close() below
+  int fd();
+
+  // Closes the file descriptor if it is open
+  void close();
+
+  // Absolute path of the file
+  std::string path() const { return path_; }
+};
+
+// Formats `value` through its stream insertion operator (operator<<) and
+// returns the resulting text.
+template<typename T>
+std::string to_string(const T& value)
+{
+  std::ostringstream formatted;
+  formatted << value;
+  return formatted.str();
+}
+
+std::unique_ptr<TempFile>
+makeTempFile(const char* name, const std::string& content);
+
+
+// Serializes `object` (through its write(fd) member), followed by `tail`,
+// into a temporary file and returns the whole file content as a zim::Buffer.
+//
+// Throws std::runtime_error if the tail cannot be written or if the data
+// cannot be read back completely.
+template<typename T>
+zim::Buffer write_to_buffer(const T& object, const std::string& tail="")
+{
+  TempFile tmpFile("test_temp_file");
+  const auto tmp_fd = tmpFile.fd();
+  object.write(tmp_fd);
+
+  const auto written = write(tmp_fd, tail.data(), tail.size());
+  if ( written < 0 || static_cast<size_t>(written) != tail.size() )
+    throw std::runtime_error("Cannot write " + tmpFile.path());
+
+  // Seeking to the end gives the total size of what has been written
+  size_type size = LSEEK(tmp_fd, 0, SEEK_END);
+
+  auto buf = zim::Buffer::makeBuffer(zim::zsize_t(size));
+  LSEEK(tmp_fd, 0, SEEK_SET);
+  char* p = const_cast<char*>(buf.data());
+  // Read back by chunks of at most 1 MiB
+  while ( size != 0 ) {
+    const auto size_to_read = std::min(size, size_type{1024*1024});
+    const auto n = read(tmp_fd, p, size_to_read);
+    // n == 0 is an unexpected end of file: treating it as an error (like -1)
+    // prevents the loop from spinning forever
+    if ( n <= 0 )
+      throw std::runtime_error("Cannot read " + tmpFile.path());
+    p += n;
+    size -= n;
+  }
+  return buf;
+}
+
+// Immutable description of one test data file.
+struct TestFile {
+  // `path` is computed from dataDir, category and filename (see tools.cpp)
+  TestFile(const std::string& dataDir,
+           const std::string& category,
+           const std::string& filename);
+
+  const std::string filename;  // name of the file, without directory
+  const std::string category;  // name of the category the file belongs to
+  const std::string path;      // location of the file
+};
+
+const std::vector<TestFile> getDataFilePath(const std::string& filename, const std::string& category = "");
+
+// Test helper owning a temporary file in which a ZIM archive gets written;
+// the file is removed when the helper is destroyed (see TempFile).
+class TempZimArchive : private TempFile {
+  public:
+    // `tempPath` is used to name the temporary file
+    explicit TempZimArchive(const char* tempPath) : TempFile(tempPath) {}
+
+    // Create an archive with one dummy item per title
+    zim::Archive createZimFromTitles(std::vector<std::string> titles);
+
+    // Create an archive with one item per {title, content} element
+    zim::Archive createZimFromContent(std::vector<std::vector<std::string>> contents);
+
+    // Path of the temporary file
+    const std::string getPath();
+};
+
+// Tells which FRONT_ARTICLE hint, if any, a TestItem reports to the creator.
+enum class IsFrontArticle {
+  YES = 0,      // hint FRONT_ARTICLE set to 1
+  NO = 1,       // hint FRONT_ARTICLE set to 0
+  DEFAULT = 2   // no hint given
+};
+
+// Minimal zim::writer::Item whose path, mimetype, title and content are plain
+// strings given at construction. `frontArticle` selects the FRONT_ARTICLE
+// hint reported by getHints().
+class TestItem : public zim::writer::Item {
+  public:
+    TestItem(
+      const std::string& path,
+      const std::string& mimetype = "text/html",
+      const std::string& title = "Test Item",
+      const std::string& content = "foo",
+      IsFrontArticle frontArticle = IsFrontArticle::DEFAULT) :
+        path(path),
+        title(title),
+        content(content),
+        mimetype(mimetype),
+        frontArticle(frontArticle)
+    {}
+    virtual ~TestItem() = default;
+
+    // `override` lets the compiler check these against zim::writer::Item.
+    std::string getPath() const override { return path; }
+    std::string getTitle() const override { return title; }
+    std::string getMimeType() const override { return mimetype; }
+
+    // FRONT_ARTICLE is set to 1 (YES) or 0 (NO); with DEFAULT no hint is given.
+    zim::writer::Hints getHints() const override {
+      switch (frontArticle) {
+        case IsFrontArticle::YES:
+          return zim::writer::Hints{{zim::writer::FRONT_ARTICLE, 1}};
+        case IsFrontArticle::NO:
+          return zim::writer::Hints{{zim::writer::FRONT_ARTICLE, 0}};
+        default:
+          return zim::writer::Hints();
+      }
+    }
+
+    // The content is served from the `content` string.
+    std::unique_ptr<zim::writer::ContentProvider> getContentProvider() const override {
+      return std::unique_ptr<zim::writer::ContentProvider>(new zim::writer::StringProvider(content));
+    }
+
+    std::string path;
+    std::string title;
+    std::string content;
+    std::string mimetype;
+    IsFrontArticle frontArticle;
+};
+
+} // namespace unittests
+
+} // namespace zim
+
+#endif // ZIM_TEST_TOOLS_H
--- /dev/null
+/*
+ * Copyright (C) 2021 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include "../src/tools.h"
+
+#include "gtest/gtest.h"
+
+namespace {
+ TEST(Tools, wordCount) {
+   // Each row pairs an input text with the count zim::countWords() must
+   // report. Rows are checked in order and the first mismatch aborts the test.
+   const struct {
+     const char* text;
+     int expectedCount;
+   } samples[] = {
+     {"", 0},
+     {" ", 0},
+     {"One", 1},
+     {"One Two Three", 3},
+     {" One ", 1},
+     {"One Two Three ", 3},
+     // '.' does not split words, a tab does.
+     {"One.Two\tThree", 2},
+   };
+
+   for (const auto& sample : samples) {
+     ASSERT_EQ(zim::countWords(sample.text), sample.expectedCount);
+   }
+ }
+
+
+ TEST(Tools, parseIllustrationPathToSize) {
+   // Paths that must parse, with the size each one must yield.
+   const struct {
+     const char* path;
+     int expectedSize;
+   } accepted[] = {
+     {"Illustration_0x0@1", 0},
+     {"Illustration_1x1@1", 1},
+     {"Illustration_01x01@1", 1},
+     {"Illustration_64x64@1", 64},
+     {"Illustration_128x128@1", 128},
+     {"Illustration_1024x1024@1", 1024},
+   };
+   for (const auto& entry : accepted) {
+     ASSERT_EQ(zim::parseIllustrationPathToSize(entry.path), entry.expectedSize);
+   }
+
+   // Paths that must be refused with std::runtime_error: misspelled prefix,
+   // missing height or scale, fractional scale, non-square or negative
+   // dimensions, and dimensions containing spaces.
+   const char* const rejected[] = {
+     "Illsration_64x64@1",
+     "Illstration_",
+     "Illustration_64x@1",
+     "Illustration_64x",
+     "Illustration_64x64",
+     "Illustration_64x64@1.5",
+     "Illustration_128x64@1",
+     "Illustration_-32x-32@1",
+     "Illustration_ 64x64@1",
+     "Illustration_64x 64@1",
+     "Illustration_ 64x 64@1",
+     "Illustration_1 28x1 28@1",
+   };
+   for (const char* badPath : rejected) {
+     ASSERT_THROW(zim::parseIllustrationPathToSize(badPath), std::runtime_error);
+   }
+ }
+
+}
--- /dev/null
+/*
+ * Copyright (C) 2013 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT. See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ */
+
+#include <zim/uuid.h>
+#include <iostream>
+#include <sstream>
+
+#include "gtest/gtest.h"
+#ifdef _WIN32
+# include <windows.h>
+# include <synchapi.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace
+{
+TEST(UuidTest, construct)
+{
+  // Raw 16-byte payloads; the two differ only at offsets 7 and 15.
+  const int kUuidLength = 16;
+  const char firstBytes[] =
+    "\x01\x23\x45\x67\x89\xab\xcd\xef\x10\x32\x54\x76\x98\xba\xdc\xfe";
+  const char secondBytes[] =
+    "\x01\x23\x45\x67\x89\xab\xcd\xe0\x10\x32\x54\x76\x98\xba\xdc\x0e";
+
+  zim::Uuid uuid1(firstBytes);
+  zim::Uuid uuid2(secondBytes);
+
+  // Distinct payloads compare unequal to each other and to a
+  // default-constructed Uuid.
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != zim::Uuid());
+  ASSERT_TRUE(uuid2 != zim::Uuid());
+
+  // Every byte handed to the constructor must show up unchanged in `data`.
+  for (int offset = 0; offset < kUuidLength; ++offset) {
+    ASSERT_EQ(uuid1.data[offset], firstBytes[offset]);
+  }
+
+  for (int offset = 0; offset < kUuidLength; ++offset) {
+    ASSERT_EQ(uuid2.data[offset], secondBytes[offset]);
+  }
+}
+
+TEST(UuidTest, generate)
+{
+  // Reference value: a default-constructed Uuid.
+  const zim::Uuid blank = zim::Uuid();
+
+  // Two default-constructed Uuids are equal to each other and to the reference.
+  zim::Uuid uuid1;
+  zim::Uuid uuid2;
+  ASSERT_TRUE(uuid1 == uuid2);
+  ASSERT_TRUE(uuid1 == blank);
+  ASSERT_TRUE(uuid2 == blank);
+
+  // After generating only the first one, it differs from both the untouched
+  // second Uuid and the reference.
+  uuid1 = zim::Uuid::generate();
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != blank);
+  ASSERT_TRUE(uuid2 == blank);
+
+  // GNU Mach's clock is not precise, so two back-to-back generate() calls
+  // could observe the same time and make this test fail. Sleep for one
+  // second between the two calls to force a time difference.
+  // Thanks to Pino Toscano.
+#if defined(_WIN32)
+  Sleep(1000);
+#else
+  sleep(1);
+#endif
+
+  // Once both are generated, they differ from each other and from the
+  // reference.
+  uuid2 = zim::Uuid::generate();
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != blank);
+  ASSERT_TRUE(uuid2 != blank);
+}
+
+TEST(UuidTest, output)
+{
+  // Textual form expected from both the stream operator and the string
+  // conversion.
+  const std::string expectedText = "550e8400-e29b-41d4-a716-446655440000";
+
+  zim::Uuid uuid(
+    "\x55\x0e\x84\x00\xe2\x9b\x41\xd4\xa7\x16\x44\x66\x55\x44\x00\x00");
+
+  // Streaming the Uuid must produce the expected text.
+  std::ostringstream stream;
+  stream << uuid;
+  const std::string streamed = stream.str();
+  ASSERT_EQ(streamed, expectedText);
+
+  // The std::string conversion must produce the same text.
+  ASSERT_EQ(static_cast<std::string>(uuid), expectedText);
+}
+};