Import zimlib_6.1.3.orig.tar.gz

author Kunal Mehta <legoktm@debian.org>

Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)

committer Kunal Mehta <legoktm@debian.org>

Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)
author Kunal Mehta <legoktm@debian.org>
Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)
committer Kunal Mehta <legoktm@debian.org>
Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)
diff --git a/.codecov.yml b/.codecov.yml

new file mode 100644 (file)

index 0000000..21288b7
--- /dev/null
+++ b/.codecov.yml
@@ -0,0 +1,17 @@
+codecov:
+  notify:
+    require_ci_to_pass: yes
+
+coverage:
+  status:
+    project:
+      default:
+        threshold: 1%
+    patch:
+      default:
+        target: 90%
+        threshold: 0%
+
+ignore:
+  - "test"
+  - "examples"
diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml

new file mode 100644 (file)

index 0000000..f39dc2a
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,12 @@
+# These are supported funding model platforms
+
+github: kiwix # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+patreon: # Replace with a single Patreon username
+open_collective: # Replace with a single Open Collective username
+ko_fi: # Replace with a single Ko-fi username
+tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
+community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
+liberapay: # Replace with a single Liberapay username
+issuehunt: # Replace with a single IssueHunt username
+otechie: # Replace with a single Otechie username
+custom: # https://kiwix.org/support-us/
diff --git a/.github/move.yml b/.github/move.yml

new file mode 100644 (file)

index 0000000..3e1491a
--- /dev/null
+++ b/.github/move.yml
@@ -0,0 +1,27 @@
+# Configuration for Move Issues - https://github.com/dessant/move-issues
+
+# Delete the command comment when it contains no other content
+deleteCommand: true
+
+# Close the source issue after moving
+closeSourceIssue: true
+
+# Lock the source issue after moving
+lockSourceIssue: false
+
+# Mention issue and comment authors
+mentionAuthors: true
+
+# Preserve mentions in the issue content
+keepContentMentions: true
+
+# Move labels that also exist on the target repository
+moveLabels: true
+
+# Set custom aliases for targets
+# aliases:
+#   r: repo
+#   or: owner/repo
+
+# Repository to extend settings from
+# _extends: repo
+\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml

new file mode 100644 (file)

index 0000000..8c49559
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,156 @@
+name: CI
+
+on: [push]
+
+jobs:
+  Macos:
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - native_dyn
+          - iOS_arm64
+          - iOS_i386
+          - iOS_x86_64
+          - iOS_armv7
+    runs-on: macos-latest
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v1
+      - name: Setup python 3.6
+        uses: actions/setup-python@v1
+        with:
+          python-version: '3.6'
+      - name: Install packages
+        uses: mstksg/get-package@v1
+        with:
+          brew: gcovr pkg-config ninja
+      - name: Install python modules
+        run: pip3 install meson==0.52.1 pytest
+      - name: Install deps
+        shell: bash
+        run: |
+          ARCHIVE_NAME=deps2_osx_${{matrix.target}}_libzim.tar.xz
+          wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C $HOME
+      - name: Compile
+        shell: bash
+        run: |
+          MESON_OPTION="--default-library=shared"
+          MESON_CROSSFILE="$HOME/BUILD_${{matrix.target}}/meson_cross_file.txt"
+          if [[ ! "${{matrix.target}}" =~ native_.* ]]; then
+            MESON_OPTION="$MESON_OPTION -Db_bitcode=true --cross-file $MESON_CROSSFILE"
+            cat $MESON_CROSSFILE
+          fi
+          export PKG_CONFIG_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig
+          meson . build ${MESON_OPTION}
+          cd build
+          ninja
+      - name: Test
+        if: startsWith(matrix.target, 'native_')
+        shell: bash
+        run: |
+          export LD_LIBRARY_PATH=$HOME/BUILD_${{matrix.target}}/INSTALL/lib:$HOME/BUILD_${{matrix.target}}/INSTALL/lib64
+          cd build
+          meson test --verbose
+        env:
+          SKIP_BIG_MEMORY_TEST: 1
+
+  Linux:
+    strategy:
+      fail-fast: false
+      matrix:
+        target:
+          - native_static
+          - native_dyn
+          - android_arm
+          - android_arm64
+          - win32_static
+          - win32_dyn
+        include:
+          - target: native_static
+            image_variant: xenial
+            lib_postfix: '/x86_64-linux-gnu'
+          - target: native_dyn
+            image_variant: xenial
+            lib_postfix: '/x86_64-linux-gnu'
+          - target: android_arm
+            image_variant: xenial
+            lib_postfix: '/x86_64-linux-gnu'
+          - target: android_arm64
+            image_variant: xenial
+            lib_postfix: '/x86_64-linux-gnu'
+          - target: win32_static
+            image_variant: f31
+            lib_postfix: '64'
+          - target: win32_dyn
+            image_variant: f31
+            lib_postfix: '64'
+    env:
+      HOME: /home/runner
+    runs-on: ubuntu-latest
+    container:
+      image: "kiwix/kiwix-build_ci:${{matrix.image_variant}}-26"
+    steps:
+    - name: Extract branch name
+      shell: bash
+      run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})"
+      id: extract_branch
+    - name: Checkout code
+      shell: python
+      run: |
+        from subprocess import check_call
+        from os import environ
+        command = [
+          'git', 'clone',
+          'https://github.com/${{github.repository}}',
+          '--depth=1',
+          '--branch', '${{steps.extract_branch.outputs.branch}}'
+        ]
+        check_call(command, cwd=environ['HOME'])
+    - name: Install deps
+      shell: bash
+      run: |
+        ARCHIVE_NAME=deps2_${OS_NAME}_${{matrix.target}}_libzim.tar.xz
+        wget -O- http://tmp.kiwix.org/ci/${ARCHIVE_NAME} | tar -xJ -C /home/runner
+    - name: Compile
+      shell: bash
+      run: |
+        if [[ "${{matrix.target}}" =~ .*_dyn ]]; then
+          MESON_OPTION="--default-library=shared"
+        else
+          MESON_OPTION="--default-library=static"
+        fi
+        if [[ "${{matrix.target}}" =~ native_.* ]]; then
+          MESON_OPTION="$MESON_OPTION -Db_coverage=true"
+        else
+          MESON_OPTION="$MESON_OPTION --cross-file $HOME/BUILD_${{matrix.target}}/meson_cross_file.txt"
+        fi
+        if [[ "${{matrix.target}}" =~ android_.* ]]; then
+          MESON_OPTION="$MESON_OPTION -Dandroid=true"
+        fi
+        cd $HOME/libzim
+        meson . build ${MESON_OPTION}
+        cd build
+        ninja
+      env:
+        PKG_CONFIG_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib/pkgconfig:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}/pkgconfig"
+    - name: Test
+      if: startsWith(matrix.target, 'native_')
+      shell: bash
+      run: |
+        cd $HOME/libzim/build
+        meson test --verbose
+        ninja coverage
+      env:
+        LD_LIBRARY_PATH: "/home/runner/BUILD_${{matrix.target}}/INSTALL/lib:/home/runner/BUILD_${{matrix.target}}/INSTALL/lib${{matrix.lib_postfix}}"
+        SKIP_BIG_MEMORY_TEST: 1
+    - name: Publish coverage
+      shell: bash
+      run: |
+        cd $HOME/libzim
+        curl https://codecov.io/bash -o codecov.sh
+        bash codecov.sh -n "${OS_NAME}_${{matrix.target}}" -Z
+        rm codecov.sh
+      if: startsWith(matrix.target, 'native_')
+      env:
+        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
diff --git a/.gitignore b/.gitignore

new file mode 100644 (file)

index 0000000..1d89127
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,34 @@
+*~
+*#*
+autom4te.cache
+build
+compile
+config.h
+configure
+depcomp
+.deps
+.dirstamp
+INSTALL
+install-sh
+*.kate-swp
+*.la
+.libs
+libtool
+*.lo
+ltmain.sh
+*.m4
+Makefile
+Makefile.in
+missing
+*.o
+stamp-h1
+.svn
+.*.swp
+*.zim
+examples/createZimExample
+src/tools/zimdump
+src/tools/zimsearch
+libzim.pc
+test-driver
+test/zimlib-test*
+test/test-suite.log
diff --git a/AUTHORS b/AUTHORS

new file mode 100644 (file)

index 0000000..1197f56
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Tommi Maekitalo <tommi@tntnet.org>
diff --git a/COPYING b/COPYING

new file mode 100644 (file)

index 0000000..e2683b5
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,280 @@
+                   GNU GENERAL PUBLIC LICENSE
+                      Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+     51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+                           Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+\f
+                   GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+\f
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+\f
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+\f
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+                           NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+                    END OF TERMS AND CONDITIONS
diff --git a/ChangeLog b/ChangeLog

new file mode 100644 (file)

index 0000000..ad2b723
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,260 @@
+libzim 6.1.3
+============
+
+ * [Writer] Use a `.tmp` suffix and rename to `.zim` at the end of the write
+ process.
+ * Add unit tests
+ * Do not include unnecessary `windows.h` headers in zim's public headers.
+
+libzim 6.1.2
+============
+
+ * [CI] Fix codecov configuration
+ * [Writer] Fix threads synchronization at end of writing process.
+
+libzim 6.1.1
+============
+
+ * Fix bug around the find function
+
+libzim 6.1.0
+============
+
+ * Compile now on OpenBSD
+ * [Test] Use the main function provided by gtest.
+ * [CI] Move the CI compilation to github actions.
+ * Add stopwords for 54 new languages.
+ * [Writer] Improve the way we are writing cluster at zim creation time.
+   - Clusters are directly written in the zim file instead of using temporary
+     files.
+   - mimetypes are limited to 944 bytes.
+ * Add a new type of iterator to iterate over articles in a performant way
+   reducing decompression of clusters. This is now the new default iterator.
+ * Add support for zim files compressed with the zstd compression algorithm.
+   It is not yet possible to use zstd to create zim files.
+
+libzim 6.0.2
+============
+
+ * Fix search suggestion parsing.
+
+libzim 6.0.1
+============
+
+ * Fix crash when trying to open an empty file.
+ * Ensure that pytest tests are run on the CI.
+
+libzim 6.0.0
+============
+
+ * [Writer] Index the articles in different threads. This is a huge speed
+   improvement as the main thread is not blocked by indexing.
+ * Index the title only if `shouldIndex` returns true.
+
+libzim 5.1.0
+============
+
+ * Improve indexation of the title.
+ * Better pertinence of suggestions (only for new zim files)
+ * Improvement of the speed of Levenshtein distance for suggestions (for old
+   zims)
+
+libzim 5.0.2
+============
+
+ * Improve README.
+ * Remove gtest as embedded subproject.
+ * Better lzma compression.
+ * Better performance of the Levenshtein algorithm (better suggestions
+   performance)
+
+libzim 5.0.1
+============
+
+ * Update README.
+ * [Writer] Add debug information (print progress of the clusters writing).
+ * [Writer] Correctly print the url to the user.
+ * [CI] Add code coverage.
+
+libzim 5.0.0
+============
+
+ * Fix thread slipping for win32 crosscompilation.
+ * Fix a potential invalid access when reading dirent.
+ * Fix memory leak in the decompression algorithm.
+ * [Writer] Fix a memory leak (cluster cleaning)
+ * [Writer] Write article data in a temporary cluster file instead of a
+   temporary file per article.
+ * [Writer] Better algorithm to store the dirent while creating the zim
+   file. Better memory usage.
+ * [Writer] [API Change] Url/Ns are now handled using the same struct Url.
+ * [Writer] [API Change] No more aid and redirectAid. A redirectArticle
+   has to implement redirectUrl.
+ * [Writer] Use a memory pool to avoid multiple small memory allocations.
+ * [Writer] [API Change] Rename `ZimCreator` to `Creator`.
+ * [API Change] File's `search` and `suggestions` now return a unique_ptr
+   instead of a raw pointer.
+
+libzim 4.0.7
+============
+
+ * Build libzim without rpath.
+
+libzim 4.0.6
+============
+
+ * Support zim file created with cluster not written sequentially.
+ * Remove a meson warning.
+
+libzim 4.0.5
+============
+
+ * Store the xapian database in the right url.
+ * Do not fail when reading very small zim file (<256b).
+ * Do not print message on normal behavior.
+ * [BUILDSYSTEM] Be able to build a dynamic lib (libzim.so) but using static
+   dependencies.
+ * [CI] Use last version of meson.
+ * [CI] Use the new deps archive xz
+
+libzim 4.0.4
+============
+
+ * Fix opening of multi-part zim.
+ * Fix conversion of path to wpath on Windows.
+
+libzim 4.0.3
+============
+
+ * Implement low level file manipulation using different backends
+
+libzim 4.0.2
+============
+
+ * [Windows] Fix opening of zim file bigger than 4GiB
+
+libzim 4.0.1
+============
+
+ * [Writer] Fix wrong redirection log message
+ * Make libzim compile natively on windows using MSVC
+ * Better message when failing to read a zim file.
+ * Make libzim on windows correctly open unicode path.
+ * Add compilation option to use less memory (but more I/O).
+   Useful on low memory devices (android)
+ * Small fixes
+
+libzim 4.0.0
+============
+
+ * [Writer] Remove a lot of memory copy.
+ * [Writer] Add xapian indexing directly in libzim.
+ * [Writer] Better API.
+ * [Writer] Use multi-threading to write clusters.
+ * [Writer] Ensure mimetype of articles is not null.
+ * Extend test timeout for cluster's test.
+ * Less memory copy for cluster's test.
+ * Allow skipping test using a lot of memory using env variable
+   `SKIP_BIG_MEMORY_TEST=1`
+ * Explicitly use the icu namespace to allow using of packaged icu lib.
+ * Use a temporary file name as long as the ZIM writing process is
+ not finished (#163)
+ * [Travis] Do not compile using gcc-5 (but the default trusty's one 4.8)
+
+libzim 3.3.0
+============
+
+ * Fix handling of big cluster (>4GiB) on 32 bits architecture. This is mainly
+ done by :
+   * Do not mmap the whole cluster by default.
+   * MMap only the memory associated with an article.
+   * If an article is > 4GiB, the blob associated with it is invalid
+     (data==size==0).
+   * Other information are still valid (directAccessInformation, ...)
+ * Fix writing of extended cluster in writer.
+ * Compile libzim on macos.
+ * Build libzim setting RPATH.
+ * Search result urls are now what is stored in the zim file. They should not
+   start with a `/`. This is a revert of the change made in last release.
+   (See kiwix/kiwix-lib#123)
+ * Spelling corrections in README.
+
+libzim 3.2.0
+============
+
+ * Support geo query if the xapian database has indexed localisation.
+ * Handle articles bigger than 4Go in the zim file (#110).
+ * Use AND operator between search term.
+ * Fix compilation with recent clang (#95).
+ * Add method to get article's data localisation in the zim file.
+ * Be able to get only a part of article (#77).
+ * Do not crash if we cannot open the xapian Database for some reasons.
+   (kiwix/kiwix-tools#153)
+ * Do not assume there is always a checksum in the zim file.
+   (kiwix/kiwix-tools#150)
+ * Try to do some sanity checks when opening a zim file.
+ * Use pytest to do some tests (when cython is available).
+ * Use levenshtein distance to sort and have better suggestion results.
+ * Search result urls are now always absolute (starts with a '/').
+   (kiwix/kiwix-lib#110)
+ * Open the file readonly when checking the zim file (and so be able to check
+   read only file).
+ * Accept absolute url starting with '/' when searching for article.
+ * Fix various bugs
+
+libzim 3.1.0
+============
+
+ * Lzma is not an optional dependency anymore.
+ * Better handle (report and not crash) invalid zim file.
+ * Embed source of gtest (used only if gtest is not available on the system)
+ * Move zimDump tools out of libzim repository to zim-tools
+ * ZimCreator tool doesn't read the command line to set options.
+
+libzim 3.0.0
+============
+
+This is a major change of the libzim.
+Expect a lot of new improvements and API changes.
+
+ * Add a suggestion mode to the search
+ * Fix licensing issues
+ * Fix wrong stemming of the query when searching
+ * Deactivate searching (and so crash) in the embedded database if the zim is
+   split
+ * Rewrite the low level memory management of libzim when reading a zim file:
+    * We use a buffer base entity to handle memory and reading file instead of
+      reading file using stream.
+    * MMap the memory when possible to avoid memory copy.
+    * Use const when possible (API break)
+ * Move to googletest instead of cxxtools for unit-tests.
+ * Fix endianness bug on arm.
+ * Do not install private headers. Those headers declare private structure and
+   should not be visible (API break)
+ * Compile libzim with `-Werror` and `-Wall` options.
+ * Make libzim thread safe for reading article.
+   The search part is not thread safe, and all search operation must be
+   protected by a lock.
+ * Add method to get only a part of an article.
+ * Move some tools to zim-tools repository.
+
+
+libzim 2.0.0
+============
+
+ * Move to meson build system
+   `libzim` now uses `meson` as build system instead of `autotools`
+ * Move to C++11 standard.
+ * Fulltext search in zim file.
+   We have integrated the xapian fulltext search in libzim.
+   So now, libzim provides an API to search in a zim containing embedded fulltext
+   index. This means that:
+    * libzim needs xapian as an (optional) dependency (if you want to compile with
+     xapian support).
+    * The old and unused search API has been removed.
+ * Remove bzip2 support.
+ * Remove Symbian support.
+ * A few API changes
+   * Make some header files private (not installed);
+   * A `Blob` can now be cast to a `string` directly;
+   * Change a lot of `File` methods to const methods.
diff --git a/README.md b/README.md

new file mode 100644 (file)

index 0000000..5f2e7f2
--- /dev/null
+++ b/README.md
@@ -0,0 +1,147 @@
+ZIM library
+===========
+
+The ZIM library is the reference implementation for the ZIM file
+format. It's a solution to read and write ZIM files on many systems
+and architectures. More information about the ZIM format and the
+openZIM project at https://openzim.org/.
+
+[![latest release](https://img.shields.io/github/v/tag/openzim/libzim?label=latest%20release&sort=semver)](https://download.openzim.org/release/libzim/)
+[![Build Status](https://github.com/openzim/libzim/workflows/CI/badge.svg?query=branch%3Amaster)](https://github.com/openzim/libzim/actions?query=branch%3Amaster)
+[![codecov](https://codecov.io/gh/openzim/libzim/branch/master/graph/badge.svg)](https://codecov.io/gh/openzim/libzim)
+[![CodeFactor](https://www.codefactor.io/repository/github/openzim/libzim/badge)](https://www.codefactor.io/repository/github/openzim/libzim)
+[![License: GPL v2](https://img.shields.io/badge/License-GPL%20v2-blue.svg)](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html)
+
+Disclaimer
+----------
+
+This document assumes you have a little knowledge about software
+compilation. If you experience difficulties with the dependencies or
+with the ZIM library compilation itself, we recommend having a look
+at [kiwix-build](https://github.com/kiwix/kiwix-build).
+
+Preamble
+--------
+
+Although the ZIM library can be compiled/cross-compiled on/for many
+systems, the following documentation explains how to do it on POSIX
+ones. It is primarily intended for GNU/Linux systems and has been tested
+on recent releases of Ubuntu and Fedora.
+
+Dependencies
+------------
+
+The ZIM library relies on many third-party software libraries. They
+are prerequisites to the ZIM library compilation. The following
+libraries need to be available:
+
+* [Z](https://zlib.net/) (package `zlib1g-dev` on Ubuntu)
+* [LZMA](https://tukaani.org/lzma/) (package `liblzma-dev` on Ubuntu)
+* [ICU](http://site.icu-project.org/) (package `libicu-dev` on Ubuntu)
+* [Zstd](https://facebook.github.io/zstd/) (package `libzstd-dev` on Ubuntu)
+* [Xapian](https://xapian.org/) - optional (package `libxapian-dev` on Ubuntu)
+* [UUID](http://e2fsprogs.sourceforge.net/) (package `uuid-dev` on Ubuntu)
+* [Google Test](https://github.com/google/googletest) - optional (package `googletest` on Ubuntu)
+
+These dependencies may or may not be packaged by your operating
+system. They may also be packaged but only in an older version. The
+compilation script will tell you if one of them is missing or too old.
+In the worst case, you will have to download and compile a more recent
+version by hand.
+
+If you want to install these dependencies locally, then ensure that
+meson (through `pkg-config`) will properly find them.
+
+Environment
+-------------
+
+The ZIM library builds using [Meson](https://mesonbuild.com/) version
+0.43 or higher. Meson itself relies on Ninja, pkg-config and a few other
+compilation tools.
+
+First install these common compilation tools:
+* Meson
+* Ninja
+* Pkg-config
+
+These tools should be packaged if you use a cutting edge operating
+system. If not, have a look at the "Troubleshooting" section.
+
+Compilation
+-----------
+
+Once all dependencies are installed, you can compile the ZIM library with:
+```bash
+meson . build
+ninja -C build
+```
+
+By default, it will compile dynamically linked libraries. All binary files
+will be created in the `build` directory created automatically by
+Meson. If you want statically linked libraries, you can add
+`--default-library=static` option to the Meson command.
+
+Depending on your system, `ninja` may be called `ninja-build`.
+
+Installation
+------------
+
+If you want to install the libzim and the headers you just have
+compiled on your system, here we go:
+```bash
+ninja -C build install
+```
+
+You might need to run the command as root (or using `sudo`), depending
+on where you want to install the libraries. After the installation
+has succeeded, you may need to run `ldconfig` (as root).
+
+Uninstallation
+------------
+
+If you want to uninstall the libzim:
+```bash
+ninja -C build uninstall
+```
+
+Like for the installation, you might need to run the command as root
+(or using `sudo`).
+
+Troubleshooting
+---------------
+
+If you need to install Meson "manually":
+```bash
+virtualenv -p python3 ./ # Create virtualenv
+source bin/activate      # Activate the virtualenv
+pip3 install meson       # Install Meson
+hash -r                  # Refresh bash paths
+```
+
+If you need to install Ninja "manually":
+```bash
+git clone https://github.com/ninja-build/ninja.git
+cd ninja
+git checkout release
+./configure.py --bootstrap
+mkdir ../bin
+cp ninja ../bin
+cd ..
+```
+
+If the automated tests fail or time out, you need to be aware that this
+test suite needs up to 16GB of memory. You can skip these specific tests with:
+```bash
+SKIP_BIG_MEMORY_TEST=1 ninja test
+```
+
+If the compilation still fails, you might need to get a more recent
+version of a dependency than the one packaged by your Linux
+distribution. Try then with a source tarball distributed by the
+problematic upstream project or even directly from the source code
+repository.
+
+License
+-------
+
+[GPLv2](https://www.gnu.org/licenses/old-licenses/gpl-2.0.en.html) or later, see [COPYING](COPYING) for more details.
diff --git a/examples/createZimExample.cpp b/examples/createZimExample.cpp

new file mode 100644 (file)

index 0000000..2d97b75
--- /dev/null
+++ b/examples/createZimExample.cpp
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2012 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <iostream>
+#include <sstream>
+#include <vector>
+#include <zim/writer/creator.h>
+#include <zim/blob.h>
+
+// Minimal zim::writer::Article implementation used by this example.
+// Each instance carries an id and a small text payload generated from that
+// id by the one-argument constructor.
+class TestArticle : public zim::writer::Article
+{
+    std::string _id;    // identifier; also returned as the title
+    std::string _data;  // article content handed out by getData()
+
+  public:
+    // Leaves both id and payload empty.
+    TestArticle()  { }
+    explicit TestArticle(const std::string& id);
+    virtual ~TestArticle() = default;
+
+    virtual std::string getAid() const;
+    virtual zim::writer::Url getUrl() const;
+    virtual std::string getTitle() const;
+    virtual bool isRedirect() const;
+    // Always asks for compression.
+    virtual bool shouldCompress() const { return true; }
+    virtual std::string getMimeType() const;
+    virtual zim::writer::Url getRedirectUrl() const;
+    // Never asks for indexing.
+    virtual bool shouldIndex() const { return false; }
+    // Size of the payload in bytes.
+    virtual zim::size_type getSize() const { return _data.size(); }
+    // Returns an empty string; the content comes from getData() below.
+    virtual std::string getFilename() const { return ""; }
+
+    // Wraps the internal payload buffer in a Blob.
+    // NOTE(review): the Blob is built from a raw pointer into _data, so it
+    // presumably must not outlive this article - confirm against zim::Blob.
+    virtual zim::Blob getData() const
+    { return zim::Blob(&_data[0], _data.size()); }
+};
+
+// Stores the id and derives the payload "this is article <id>\n" from it.
+TestArticle::TestArticle(const std::string& id)
+  : _id(id),
+    _data("this is article " + id + "\n")
+{
+}
+
+// Returns the id given at construction.
+std::string TestArticle::getAid() const
+{
+  return _id;
+}
+
+// Builds the article URL from 'A' and the id.
+// NOTE(review): 'A' is presumably the namespace character - confirm against
+// zim::writer::Url.
+zim::writer::Url TestArticle::getUrl() const
+{
+  return zim::writer::Url('A', _id);
+}
+
+// The title is simply the id.
+std::string TestArticle::getTitle() const
+{
+  return _id;
+}
+
+// Example articles are never redirects.
+bool TestArticle::isRedirect() const
+{
+  return false;
+}
+
+// Every example article is reported as plain text.
+std::string TestArticle::getMimeType() const
+{
+  return "text/plain";
+}
+
+// Returns a default-constructed Url; isRedirect() is always false.
+zim::writer::Url TestArticle::getRedirectUrl() const
+{
+  return zim::writer::Url();
+}
+
+int main(int argc, char* argv[])
+{
+  unsigned max = 16;
+  try {
+    zim::writer::Creator c;
+    c.startZimCreation("foo.zim");
+    for (unsigned n = 0; n < max; ++n)
+    {
+      std::ostringstream id;
+      id << (n + 1);
+      auto article = std::make_shared<TestArticle>(id.str());
+      c.addArticle(article);
+    }
+    c.finishZimCreation();
+  }
+  catch (const std::exception& e)
+  {
+    std::cerr << e.what() << std::endl;
+  }
+}
+
diff --git a/examples/meson.build b/examples/meson.build

new file mode 100644 (file)

index 0000000..fb6b77c
--- /dev/null
+++ b/examples/meson.build
@@ -0,0 +1,6 @@
+
+# Build the createZimExample program from createZimExample.cpp and link it
+# with libzim. libzim, extra_link_args, include_directory and the *_dep
+# variables are defined by other meson.build files of the project.
+executable('createZimExample', 'createZimExample.cpp',
+           link_with: libzim,
+           link_args: extra_link_args,
+           include_directories: include_directory,
+           dependencies: [thread_dep, xapian_dep, icu_dep, zlib_dep, lzma_dep])
diff --git a/include/meson.build b/include/meson.build

new file mode 100644 (file)

index 0000000..6229ca8
--- /dev/null
+++ b/include/meson.build
@@ -0,0 +1,23 @@
+# Include path object for this directory, so sources can use <zim/...>.
+include_directory = include_directories('.')
+
+# Reader-side public headers, installed into the 'zim' subdirectory of the
+# header install directory.
+install_headers(
+    'zim/article.h',
+    'zim/blob.h',
+    'zim/error.h',
+    'zim/file.h',
+    'zim/fileheader.h',
+    'zim/fileiterator.h',
+    'zim/search.h',
+    'zim/search_iterator.h',
+    'zim/uuid.h',
+    'zim/zim.h',
+    subdir:'zim'
+)
+
+# Writer-side public headers, installed into 'zim/writer'.
+install_headers(
+    'zim/writer/article.h',
+    'zim/writer/url.h',
+    'zim/writer/creator.h',
+    subdir:'zim/writer'
+)
+
diff --git a/include/zim/article.h b/include/zim/article.h

new file mode 100644 (file)

index 0000000..3aa3082
--- /dev/null
+++ b/include/zim/article.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ARTICLE_H
+#define ZIM_ARTICLE_H
+
+#include <string>
+#include "zim.h"
+#include "blob.h"
+#include <limits>
+#include <iosfwd>
+
+#ifdef max
+#undef max
+#endif
+
+namespace zim
+{
+  class Cluster;
+  class Dirent;
+  class FileImpl;
+
+  // Handle to one article of a ZIM file.
+  //
+  // An Article is a (shared FileImpl pointer, index) pair. A
+  // default-constructed Article carries the sentinel index
+  // numeric_limits<article_index_type>::max() and reports good() == false.
+  class Article
+  {
+    private:
+      std::shared_ptr<FileImpl> file;
+      article_index_type idx;
+
+      std::shared_ptr<const Dirent> getDirent() const;
+
+    public:
+      // Creates an invalid article (see good()).
+      Article()
+        : idx(std::numeric_limits<article_index_type>::max())
+          { }
+
+      // Creates a handle to the article at index idx_ of file_.
+      Article(std::shared_ptr<FileImpl> file_, article_index_type idx_)
+        : file(file_),
+          idx(idx_)
+          { }
+
+      std::string getParameter() const;
+
+      std::string getTitle() const;
+      std::string getUrl() const;
+      std::string getLongUrl() const;
+
+      uint16_t    getLibraryMimeType() const;
+      const std::string&  getMimeType() const;
+
+      bool        isRedirect() const;
+      bool        isLinktarget() const;
+      bool        isDeleted() const;
+
+      char        getNamespace() const;
+
+      article_index_type   getRedirectIndex() const;
+      Article     getRedirectArticle() const;
+
+      size_type   getArticleSize() const;
+
+      // Orders articles by namespace first, then by title.
+      bool operator< (const Article& a) const
+        { return getNamespace() < a.getNamespace()
+              || (getNamespace() == a.getNamespace()
+               && getTitle() < a.getTitle()); }
+
+      std::shared_ptr<const Cluster> getCluster() const;
+      cluster_index_type getClusterNumber() const;
+
+      // The two-argument overload takes an explicit size in addition to the
+      // offset.
+      Blob getData(offset_type offset=0) const;
+      Blob getData(offset_type offset, size_type size) const;
+
+      offset_type getOffset() const;
+      std::pair<std::string, offset_type> getDirectAccessInformation() const;
+
+      // NOTE(review): unlike the other accessors, getPage() is not const.
+      std::string getPage(bool layout = true, unsigned maxRecurse = 10);
+      void getPage(std::ostream&, bool layout = true, unsigned maxRecurse = 10);
+
+      // Index this handle was constructed with.
+      article_index_type   getIndex() const   { return idx; }
+
+      // False only when the index equals the "invalid" sentinel.
+      bool good() const   { return idx != std::numeric_limits<article_index_type>::max(); }
+  };
+
+}
+
+#endif // ZIM_ARTICLE_H
+
diff --git a/include/zim/blob.h b/include/zim/blob.h

new file mode 100644 (file)

index 0000000..928394e
--- /dev/null
+++ b/include/zim/blob.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BLOB_H
+#define ZIM_BLOB_H
+
+#include "zim.h"
+
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <memory>
+
+namespace zim
+{
+  class Buffer;
+  // A (pointer, size) view of a run of bytes.
+  //
+  // A Blob can also hold a shared_ptr to a Buffer.
+  // NOTE(review): that Buffer presumably keeps the viewed memory alive -
+  // confirm in the implementation of the constructors.
+  class Blob
+  {
+      const char* _data;
+      size_type _size;
+      std::shared_ptr<const Buffer> _buffer;
+
+    public:
+      Blob();
+      Blob(const char* data, size_type size);
+      Blob(std::shared_ptr<const Buffer> buffer);
+
+      // Copies the viewed bytes into a new std::string.
+      operator std::string() const { return std::string(_data, _size); }
+      const char* data() const  { return _data; }
+      // One past the last viewed byte.
+      const char* end() const   { return _data + _size; }
+      size_type size() const     { return _size; }
+  };
+
+  // Writes the raw bytes of the blob to the stream. Nothing is written when
+  // the blob has a null data pointer.
+  inline std::ostream& operator<< (std::ostream& out, const Blob& blob)
+  {
+    if (!blob.data())
+      return out;
+
+    out.write(blob.data(), blob.size());
+    return out;
+  }
+
+  // Two blobs are equal when they have the same size and the same bytes.
+  inline bool operator== (const Blob& b1, const Blob& b2)
+  {
+    if (b1.size() != b2.size())
+      return false;
+
+    return std::equal(b1.data(), b1.end(), b2.data());
+  }
+}
+
+#endif // ZIM_BLOB_H
diff --git a/include/zim/error.h b/include/zim/error.h

new file mode 100644 (file)

index 0000000..fb59e0d
--- /dev/null
+++ b/include/zim/error.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ERROR_H
+#define ZIM_ERROR_H
+
+#include <stdexcept>
+
+namespace zim
+{
+  // Exception type for ZIM file format errors. It is a std::runtime_error;
+  // the message is forwarded unchanged and available through what().
+  class ZimFileFormatError : public std::runtime_error
+  {
+    public:
+      explicit ZimFileFormatError(const std::string& msg)
+        : std::runtime_error(msg)
+        { }
+  };
+
+}
+
+#endif // ZIM_ERROR_H
+
diff --git a/include/zim/file.h b/include/zim/file.h

new file mode 100644 (file)

index 0000000..970aaa0
--- /dev/null
+++ b/include/zim/file.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2006,2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_H
+#define ZIM_FILE_H
+
+#include <string>
+#include <iterator>
+#include <memory>
+#include "zim.h"
+#include "article.h"
+#include "blob.h"
+#include "fileheader.h"
+
+namespace zim
+{
+  class Search;
+  class FileImpl;
+  class Cluster;
+
+  // Handle to a ZIM file.
+  //
+  // The only data member is a shared_ptr<FileImpl>, so copies of a File share
+  // the same implementation object. A default-constructed File has no
+  // implementation attached.
+  class File
+  {
+    std::shared_ptr<FileImpl> impl;
+
+    public:
+      File()
+        { }
+      // NOTE(review): presumably opens the ZIM file named fname - confirm in
+      // the implementation.
+      explicit File(const std::string& fname);
+
+      const std::string& getFilename() const;
+      const Fileheader& getFileheader() const;
+      offset_type getFilesize() const;
+
+      article_index_type getCountArticles() const;
+
+      // Article lookup by index, by namespace + url/title, or by position in
+      // cluster order.
+      Article getArticle(article_index_type idx) const;
+      Article getArticle(char ns, const std::string& url) const;
+      Article getArticleByUrl(const std::string& url) const;
+      Article getArticleByTitle(article_index_type idx) const;
+      Article getArticleByTitle(char ns, const std::string& title) const;
+      Article getArticleByClusterOrder(article_index_type idx) const;
+
+      std::shared_ptr<const Cluster> getCluster(cluster_index_type idx) const;
+      cluster_index_type getCountClusters() const;
+      offset_type getClusterOffset(cluster_index_type idx) const;
+
+      Blob getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const;
+      offset_type getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const;
+
+      article_index_type getNamespaceBeginOffset(char ch) const;
+      article_index_type getNamespaceEndOffset(char ch) const;
+      article_index_type getNamespaceCount(char ns) const;
+
+      std::string getNamespaces() const;
+      bool hasNamespace(char ch) const;
+
+      // Defined in fileiterator.h.
+      class const_iterator;
+
+      const_iterator begin() const;
+      const_iterator beginByTitle() const;
+      const_iterator beginByUrl() const;
+      const_iterator end() const;
+      const_iterator findByTitle(char ns, const std::string& title) const;
+      const_iterator find(char ns, const std::string& url) const;
+      const_iterator find(const std::string& url) const;
+
+
+      // Both return a heap-allocated Search owned by the caller.
+      std::unique_ptr<Search> search(const std::string& query, int start, int end) const;
+      std::unique_ptr<Search> suggestions(const std::string& query, int start, int end) const;
+
+      time_t getMTime() const;
+
+      const std::string& getMimeType(uint16_t idx) const;
+
+      // NOTE(review): getChecksum() and verify() are not const, unlike the
+      // rest of the interface.
+      std::string getChecksum();
+      bool verify();
+
+      bool is_multiPart() const;
+  };
+
+  std::string urldecode(const std::string& url);
+
+}
+
+#endif // ZIM_FILE_H
+
diff --git a/include/zim/fileheader.h b/include/zim/fileheader.h

new file mode 100644 (file)

index 0000000..4d67809
--- /dev/null
+++ b/include/zim/fileheader.h
@@ -0,0 +1,123 @@
+/*
+ * Copyright (C) 2008 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILEHEADER_H
+#define ZIM_FILEHEADER_H
+
+#include <memory>
+#include "zim.h"
+#include "uuid.h"
+#include <iosfwd>
+#include <limits>
+
+// max may be defined as a macro by Windows includes
+#ifdef max
+#undef max
+#endif
+
+namespace zim
+{
+  class Buffer;
+  class Fileheader
+  {
+    public:
+      static const uint32_t zimMagic;
+      static const uint16_t zimClassicMajorVersion;
+      static const uint16_t zimExtendedMajorVersion;
+      static const uint16_t zimMinorVersion;
+      static const size_type size;
+
+    private:
+      uint16_t majorVersion;
+      uint16_t minorVersion;
+      Uuid uuid;
+      article_index_type articleCount;
+      offset_type titleIdxPos;
+      offset_type urlPtrPos;
+      offset_type mimeListPos;
+      cluster_index_type clusterCount;
+      offset_type clusterPtrPos;
+      article_index_type mainPage;
+      article_index_type layoutPage;
+      offset_type checksumPos;
+
+    public:
+      Fileheader()
+        : majorVersion(zimClassicMajorVersion),
+          minorVersion(zimMinorVersion),
+          articleCount(0),
+          titleIdxPos(0),
+          urlPtrPos(0),
+          clusterCount(0),
+          clusterPtrPos(0),
+          mainPage(std::numeric_limits<article_index_type>::max()),
+          layoutPage(std::numeric_limits<article_index_type>::max()),
+          checksumPos(std::numeric_limits<offset_type>::max())
+      {}
+
+      void write(int out_fd) const;
+      void read(std::shared_ptr<const Buffer> buffer);
+
+      // Do some sanity check, raise a ZimFileFormateError is
+      // something is wrong.
+      void sanity_check() const;
+
+      uint16_t getMajorVersion() const             { return majorVersion; }
+      void setMajorVersion(uint16_t v)             { majorVersion = v; }
+
+      uint16_t getMinorVersion() const             { return minorVersion; }
+      void setMinorVersion(uint16_t v)             { minorVersion = v; }
+
+      const Uuid& getUuid() const                  { return uuid; }
+      void setUuid(const Uuid& uuid_)              { uuid = uuid_; }
+
+      article_index_type getArticleCount() const            { return articleCount; }
+      void      setArticleCount(article_index_type s)       { articleCount = s; }
+
+      offset_type getTitleIdxPos() const           { return titleIdxPos; }
+      void        setTitleIdxPos(offset_type p)    { titleIdxPos = p; }
+
+      offset_type getUrlPtrPos() const             { return urlPtrPos; }
+      void        setUrlPtrPos(offset_type p)      { urlPtrPos = p; }
+
+      offset_type getMimeListPos() const           { return mimeListPos; }
+      void        setMimeListPos(offset_type p)    { mimeListPos = p; }
+
+      cluster_index_type   getClusterCount() const          { return clusterCount; }
+      void        setClusterCount(cluster_index_type s)     { clusterCount = s; }
+
+      offset_type getClusterPtrPos() const         { return clusterPtrPos; }
+      void        setClusterPtrPos(offset_type p)  { clusterPtrPos = p; }
+
+      bool        hasMainPage() const              { return mainPage != std::numeric_limits<article_index_type>::max(); }
+      article_index_type   getMainPage() const     { return mainPage; }
+      void        setMainPage(article_index_type s){ mainPage = s; }
+
+      bool        hasLayoutPage() const            { return layoutPage != std::numeric_limits<article_index_type>::max(); }
+      article_index_type   getLayoutPage() const   { return layoutPage; }
+      void        setLayoutPage(article_index_type s)       { layoutPage = s; }
+
+      bool        hasChecksum() const              { return getMimeListPos() >= 80; }
+      offset_type getChecksumPos() const           { return hasChecksum() ? checksumPos : 0; }
+      void        setChecksumPos(offset_type p)    { checksumPos = p; }
+  };
+
+}
+
+#endif // ZIM_FILEHEADER_H
diff --git a/include/zim/fileiterator.h b/include/zim/fileiterator.h

new file mode 100644 (file)

index 0000000..ea7943b
--- /dev/null
+++ b/include/zim/fileiterator.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILEITERATOR_H
+#define ZIM_FILEITERATOR_H
+
+#include <iterator>
+#include "article.h"
+
+namespace zim
+{
+  class File::const_iterator : public std::iterator<std::bidirectional_iterator_tag, Article>
+  {
+    public:
+      enum Mode {
+        UrlIterator,
+        ArticleIterator,
+        ClusterIterator
+      };
+
+    private:
+      const File* file;
+      article_index_type idx;
+      mutable Article article;
+      Mode mode;
+
+      bool is_end() const  { return file == 0 || idx >= file->getCountArticles(); }
+
+    public:
+      explicit const_iterator(const File* file_, article_index_type idx_, Mode mode_)
+        : file(file_),
+          idx(idx_),
+          mode(mode_)
+      { }
+
+      article_index_type getIndex() const   { return idx; }
+      const File& getFile() const  { return *file; }
+
+      bool operator== (const const_iterator& it) const
+        { return (is_end() && it.is_end())
+              || (file == it.file && idx == it.idx); }
+      bool operator!= (const const_iterator& it) const
+        { return !operator==(it); }
+
+      const_iterator& operator++()
+      {
+        ++idx;
+        article = Article();
+        return *this;
+      }
+
+      const_iterator operator++(int)
+      {
+        const_iterator it = *this;
+        operator++();
+        return it;
+      }
+
+      const_iterator& operator--()
+      {
+        --idx;
+        article = Article();
+        return *this;
+      }
+
+      const_iterator operator--(int)
+      {
+        const_iterator it = *this;
+        operator--();
+        return it;
+      }
+
+      const Article& operator*() const
+      {
+        if (!article.good())
+        {
+          switch(mode)
+          {
+            case UrlIterator:
+             article = file->getArticle(idx);
+             break;
+            case ArticleIterator:
+              article = file->getArticleByTitle(idx);
+              break;
+            case ClusterIterator:
+              article = file->getArticleByClusterOrder(idx);
+              break;
+          }
+        }
+        return article;
+      }
+
+      pointer operator->() const
+      {
+        operator*();
+        return &article;
+      }
+  };
+}
+
+#endif // ZIM_FILEITERATOR_H
+
diff --git a/include/zim/search.h b/include/zim/search.h

new file mode 100644 (file)

index 0000000..bc8d9a5
--- /dev/null
+++ b/include/zim/search.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_H
+#define ZIM_SEARCH_H
+
+#include "search_iterator.h"
+#include <vector>
+#include <string>
+#include <map>
+
+namespace zim
+{
+
+class File;
+// A search over one or several ZIM files.
+//
+// The set_* methods and add_zimfile() return a reference to the Search so
+// calls can be chained. Results are reached through begin()/end(), which
+// return search_iterator objects.
+class Search
+{
+    friend class search_iterator;
+    friend struct search_iterator::InternalData;
+    public:
+        typedef search_iterator iterator;
+
+        // NOTE(review): the files are taken as raw pointers; presumably they
+        // must outlive the Search - confirm in the implementation.
+        explicit Search(const std::vector<const File*> zimfiles);
+        explicit Search(const File* zimfile);
+        Search(const Search& it);
+        Search& operator=(const Search& it);
+        Search(Search&& it);
+        Search& operator=(Search&& it);
+        ~Search();
+
+        void set_verbose(bool verbose);
+
+        Search& add_zimfile(const File* zimfile);
+        Search& set_query(const std::string& query);
+        Search& set_georange(float latitude, float longitude, float distance);
+        Search& set_range(int start, int end);
+        Search& set_suggestion_mode(bool suggestion_mode);
+
+        search_iterator begin() const;
+        search_iterator end() const;
+        int get_matches_estimated() const;
+
+    private:
+         // Implementation details hidden behind an opaque struct.
+         struct InternalData;
+         std::unique_ptr<InternalData> internal;
+         std::vector<const File*> zimfiles;
+
+         // Members marked mutable can be modified from the const methods
+         // above (begin(), end(), get_matches_estimated()).
+         mutable std::map<std::string, int> valuesmap;
+         mutable std::string prefixes;
+         std::string query;
+         float latitude;
+         float longitude;
+         float distance;
+         int range_start;
+         int range_end;
+         bool suggestion_mode;
+         bool geo_query;
+         mutable bool search_started;
+         mutable bool has_database;
+         mutable bool verbose;
+         mutable int estimated_matches_number;
+};
+
+} //namespace zim
+
+#endif // ZIM_SEARCH_H
diff --git a/include/zim/search_iterator.h b/include/zim/search_iterator.h

new file mode 100644 (file)

index 0000000..9a44f32
--- /dev/null
+++ b/include/zim/search_iterator.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_ITERATOR_H
+#define ZIM_SEARCH_ITERATOR_H
+
+#include <memory>
+#include <iterator>
+#include "article.h"
+
+namespace zim
+{
+class Search;
+// Bidirectional iterator over the results of a Search.
+//
+// Dereferencing gives access to an Article; the get_* accessors expose
+// per-result data. The state lives in an opaque InternalData object owned
+// through a unique_ptr.
+class search_iterator : public std::iterator<std::bidirectional_iterator_tag, Article>
+{
+    friend class zim::Search;
+    public:
+        search_iterator();
+        search_iterator(const search_iterator& it);
+        search_iterator& operator=(const search_iterator& it);
+        search_iterator(search_iterator&& it);
+        search_iterator& operator=(search_iterator&& it);
+        ~search_iterator();
+
+        bool operator== (const search_iterator& it) const;
+        bool operator!= (const search_iterator& it) const;
+
+        search_iterator& operator++();
+        search_iterator operator++(int);
+        search_iterator& operator--();
+        search_iterator operator--(int);
+
+        std::string get_url() const;
+        std::string get_title() const;
+        int get_score() const;
+        std::string get_snippet() const;
+        int get_wordCount() const;
+        int get_size() const;
+        int get_fileIndex() const;
+        reference operator*() const;
+        pointer operator->() const;
+
+    private:
+        struct InternalData;
+        std::unique_ptr<InternalData> internal;
+        // Private: only this class and its friend Search can build an
+        // iterator from internal data.
+        search_iterator(InternalData* internal_data);
+
+        bool is_end() const;
+};
+
+} // namespace zim
+
+#endif // ZIM_SEARCH_ITERATOR_H
diff --git a/include/zim/uuid.h b/include/zim/uuid.h

new file mode 100644 (file)

index 0000000..f86b51c
--- /dev/null
+++ b/include/zim/uuid.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_UUID_H
+#define ZIM_UUID_H
+
+#include <iosfwd>
+#include <algorithm>
+#include <cstring>
+#include <string>
+
+namespace zim
+{
+  // 16-byte identifier stored as a raw char array.
+  struct Uuid
+  {
+    // Builds an all-zero identifier.
+    Uuid()
+    {
+      std::memset(data, 0, 16);
+    }
+
+    // Copies exactly 16 bytes from uuid. The array bound in the parameter
+    // type is not enforced by the language, so the caller must supply at
+    // least 16 readable bytes.
+    Uuid(const char uuid[16])
+    {
+      std::copy(uuid, uuid+16, data);
+    }
+
+    // Defined out of line; the role of `value` is not visible from this header.
+    static Uuid generate(std::string value = "");
+
+    // Byte-wise comparison of the 16 data bytes.
+    bool operator== (const Uuid& other) const
+      { return std::equal(data, data+16, other.data); }
+    bool operator!= (const Uuid& other) const
+      { return !(*this == other); }
+    // Always 16, the length of `data`.
+    unsigned size() const  { return 16; }
+
+    char data[16];
+  };
+
+  std::ostream& operator<< (std::ostream& out, const Uuid& uuid);
+
+}
+
+#endif // ZIM_UUID_H
diff --git a/include/zim/writer/article.h b/include/zim/writer/article.h

new file mode 100644 (file)

index 0000000..2194955
--- /dev/null
+++ b/include/zim/writer/article.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_ARTICLESOURCE_H
+#define ZIM_WRITER_ARTICLESOURCE_H
+
+#include <stdexcept>
+#include <zim/blob.h>
+#include <zim/zim.h>
+#include <zim/uuid.h>
+#include <zim/writer/url.h>
+#include <string>
+
+namespace zim
+{
+  namespace writer
+  {
+    class ArticleSource;
+    // Abstract description of one article handed to the writer. Most
+    // accessors are pure virtual and must be provided by the implementer;
+    // isLinktarget(), isDeleted() and getNextCategory() have out-of-line
+    // default implementations that are not visible from this header.
+    class Article
+    {
+      public:
+        virtual Url getUrl() const = 0;
+        virtual std::string getTitle() const = 0;
+        virtual bool isRedirect() const = 0;
+        virtual bool isLinktarget() const;
+        virtual bool isDeleted() const;
+        virtual std::string getMimeType() const = 0;
+        virtual bool shouldCompress() const = 0;
+        virtual bool shouldIndex() const = 0;
+        // Presumably only meaningful when isRedirect() is true — confirm
+        // against the writer implementation.
+        virtual Url getRedirectUrl() const = 0;
+        virtual zim::size_type getSize() const = 0;
+        virtual Blob getData() const = 0;
+        virtual std::string getFilename() const = 0;
+        virtual ~Article() = default;
+
+        // Returns the next category id the article is assigned to.
+        virtual std::string getNextCategory();
+    };
+
+    // Abstract category: implementers provide its data, url and title.
+    // Note that getData() is non-const here, unlike Article::getData().
+    class Category
+    {
+      public:
+        virtual Blob getData() = 0;
+        virtual std::string getUrl() const = 0;
+        virtual std::string getTitle() const = 0;
+        virtual ~Category() = default;
+    };
+
+    // Abstract producer of articles (and optionally categories) for the writer.
+    class ArticleSource
+    {
+      public:
+        // Optional hook; the default implementation ignores the file name.
+        virtual void setFilename(const std::string& fname) { }
+        // Must be implemented. How the end of the sequence is signalled is
+        // not visible from this header (presumably a null pointer — confirm).
+        virtual const Article* getNextArticle() = 0;
+
+        // After fetching the articles and, for each article, its category ids
+        // through Article::getNextCategory, the writer has a list of category
+        // ids. Using this list, the writer fetches the category data with
+        // this method.
+        virtual Category* getCategory(const std::string& cid);
+        virtual ~ArticleSource() = default;
+    };
+
+  }
+}
+
+#endif // ZIM_WRITER_ARTICLESOURCE_H
diff --git a/include/zim/writer/creator.h b/include/zim/writer/creator.h

new file mode 100644 (file)

index 0000000..b215f05
--- /dev/null
+++ b/include/zim/writer/creator.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CREATOR_H
+#define ZIM_WRITER_CREATOR_H
+
+#include <memory>
+#include <zim/zim.h>
+#include <zim/writer/article.h>
+
+namespace zim
+{
+  class Fileheader;
+  namespace writer
+  {
+    class CreatorData;
+    // Entry point for creating a zim file: configure it with the setters,
+    // then call startZimCreation(), addArticle() for each article and
+    // finishZimCreation(). The working state is kept in the opaque
+    // CreatorData.
+    class Creator
+    {
+      public:
+        Creator(bool verbose = false);
+        virtual ~Creator();
+
+        // NOTE(review): the accessors use zim::size_type (uint64_t) while the
+        // member below is a size_t, so the setter narrows on platforms where
+        // size_t is 32 bits.
+        zim::size_type getMinChunkSize() const { return minChunkSize; }
+        void setMinChunkSize(zim::size_type s) { minChunkSize = s; }
+        // Records whether an index is wanted and the language to use for it.
+        void setIndexing(bool indexing, std::string language)
+        { withIndex = indexing; indexingLanguage = language; }
+        // Deprecated alias of setNbWorkerThreads(); both set nbWorkerThreads.
+        DEPRECATED void setCompressionThreads(unsigned ct) { nbWorkerThreads = ct; }
+        void setNbWorkerThreads(unsigned ct) { nbWorkerThreads = ct; }
+
+
+        virtual void startZimCreation(const std::string& fname);
+        virtual void addArticle(std::shared_ptr<Article> article);
+        virtual void finishZimCreation();
+
+        // Overridable hooks; the defaults return an empty Url and a freshly
+        // generated Uuid respectively.
+        virtual Url getMainUrl() const { return Url(); }
+        virtual Url getLayoutUrl() const { return Url(); }
+        virtual zim::Uuid getUuid() const { return Uuid::generate(); }
+
+      private:
+        std::unique_ptr<CreatorData> data;
+        bool verbose;
+        bool withIndex = false;
+        size_t minChunkSize = 1024-64;
+        std::string indexingLanguage;
+        unsigned nbWorkerThreads = 4;
+
+        void fillHeader(Fileheader* header) const;
+        void write() const;
+    };
+  }
+
+}
+
+#endif // ZIM_WRITER_CREATOR_H
diff --git a/include/zim/writer/url.h b/include/zim/writer/url.h

new file mode 100644 (file)

index 0000000..b7fa96d
--- /dev/null
+++ b/include/zim/writer/url.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_URL_H
+#define ZIM_WRITER_URL_H
+
+#include <string>
+
+namespace zim
+{
+  namespace writer
+  {
+    // A url split into a one-character namespace and the remaining path.
+    class Url {
+      public:
+        // Empty url: namespace 0 and empty path (see empty()).
+        Url() :
+          url(),
+          ns(0)
+        {}
+        Url(char ns, std::string url) :
+          url(url),
+          ns(ns)
+        {}
+        // Builds from a long url: the first character is the namespace and
+        // everything from index 2 on is the path; the character at index 1 is
+        // skipped without being checked. Inside the initializers `url` names
+        // the parameter, not the member. std::string::substr(2) throws
+        // std::out_of_range when the argument has fewer than 2 characters.
+        Url(std::string url) :
+          url(url.substr(2)),
+          ns(url[0])
+        {}
+        char getNs() const { return ns; }
+        const std::string& getUrl() const { return url; }
+        // Namespace character, '/', then the path.
+        std::string getLongUrl() const { return std::string(1, ns) + '/' + url; }
+        bool empty() const { return ns == 0 && url.empty(); }
+      private:
+        std::string url;
+        char ns;
+      friend bool operator< (const Url& lhs, const Url& rhs);
+      friend bool operator== (const  Url& lhs, const Url& rhs);
+    };
+
+    // Orders by namespace first, then by path.
+    inline bool operator< (const Url& lhs, const Url& rhs) {
+        return lhs.ns < rhs.ns
+          ||   (lhs.ns == rhs.ns && lhs.url < rhs.url);
+    }
+    // Equal when both namespace and path are equal.
+    inline bool operator== (const Url& lhs, const Url& rhs) {
+        return lhs.ns == rhs.ns && lhs.url == rhs.url;
+    }
+  }
+}
+
+#endif // ZIM_WRITER_URL_H
diff --git a/include/zim/zim.h b/include/zim/zim.h

new file mode 100644 (file)

index 0000000..69780ac
--- /dev/null
+++ b/include/zim/zim.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ZIM_H
+#define ZIM_ZIM_H
+
+#include <cstdint>
+
+// DEPRECATED marks a declaration as deprecated using the compiler-specific
+// syntax. On compilers that define neither __GNUC__ nor _MSC_VER it expands
+// to nothing and a build-time message asks for a proper implementation.
+#ifdef __GNUC__
+#define DEPRECATED __attribute__((deprecated))
+#elif defined(_MSC_VER)
+#define DEPRECATED __declspec(deprecated)
+#else
+#pragma message("WARNING: You need to implement DEPRECATED for this compiler")
+#define DEPRECATED
+#endif
+
+
+namespace zim
+{
+  // An index of an article (in a zim file)
+  typedef uint32_t article_index_type;
+
+  // An index of a cluster (in a zim file)
+  typedef uint32_t cluster_index_type;
+
+  // An index of a blob (in a cluster)
+  typedef uint32_t blob_index_type;
+
+  // The size of something (article, zim, cluster, blob, ...)
+  typedef uint64_t size_type;
+
+  // An offset.
+  typedef uint64_t offset_type;
+
+  // Compression identifiers. The enumerators carry no explicit values, so
+  // they are numbered 0..5 in declaration order; do not reorder them.
+  enum CompressionType
+  {
+    zimcompDefault,
+    zimcompNone,
+    zimcompZip,
+    zimcompBzip2, // Not supported anymore in the libzim
+    zimcompLzma,
+    zimcompZstd
+  };
+
+  static const char MimeHtmlTemplate[] = "text/x-zim-htmltemplate";
+}
+
+#endif // ZIM_ZIM_H
+
diff --git a/meson.build b/meson.build

new file mode 100644 (file)

index 0000000..ab6ebe7
--- /dev/null
+++ b/meson.build
@@ -0,0 +1,83 @@
+# Top-level build definition for libzim.
+project('libzim', ['c', 'cpp'],
+  version : '6.1.3',
+  license : 'GPL2',
+  default_options : ['c_std=c11', 'cpp_std=c++11'])
+
+# Large-file support macros, added to C++ compilation only.
+# NOTE(review): this tests build_machine while the mmap check below tests
+# target_machine; in Meson the machine the library runs on is host_machine.
+# Confirm the intended behaviour for cross builds.
+if build_machine.system() != 'windows'
+  add_project_arguments('-D_LARGEFILE64_SOURCE=1', '-D_FILE_OFFSET_BITS=64', language: 'cpp')
+endif
+
+sizeof_off_t = meson.get_compiler('cpp').sizeof('off_t')
+
+# Values collected here are presumably written to a generated config header
+# by one of the subdirectories (not visible in this file).
+conf = configuration_data()
+conf.set('VERSION', '"@0@"'.format(meson.project_version()))
+conf.set('DIRENT_CACHE_SIZE', get_option('DIRENT_CACHE_SIZE'))
+conf.set('CLUSTER_CACHE_SIZE', get_option('CLUSTER_CACHE_SIZE'))
+conf.set('LZMA_MEMORY_SIZE', get_option('LZMA_MEMORY_SIZE'))
+conf.set10('MMAP_SUPPORT_64', sizeof_off_t==8)
+# mmap is never enabled on windows, whatever the USE_MMAP option says.
+if target_machine.system() == 'windows'
+    conf.set('ENABLE_USE_MMAP', false)
+else
+    conf.set('ENABLE_USE_MMAP', get_option('USE_MMAP'))
+endif
+conf.set('ENABLE_USE_BUFFER_HEADER', get_option('USE_BUFFER_HEADER'))
+
+# Static lookup of dependencies is requested either explicitly or implicitly
+# when the library itself is built static.
+static_linkage = get_option('static-linkage')
+static_linkage = static_linkage or get_option('default_library')=='static'
+
+# zlib, zstd and xapian are optional; liblzma is mandatory.
+zlib_dep = dependency('zlib', required:false, static:static_linkage)
+conf.set('ENABLE_ZLIB', zlib_dep.found())
+
+lzma_dep = dependency('liblzma', static:static_linkage)
+
+zstd_dep = dependency('libzstd', required:false, static:static_linkage)
+conf.set('ENABLE_ZSTD', zstd_dep.found())
+
+xapian_dep = dependency('xapian-core',
+                        required:false,
+                        static:static_linkage)
+conf.set('ENABLE_XAPIAN', xapian_dep.found())
+
+# pkg_requires accumulates the "Requires" list of the generated .pc file.
+pkg_requires = ['liblzma']
+if build_machine.system() == 'windows'
+    thread_dep = dependency('libpthreadVC2')
+    pkg_requires += ['libpthreadVC2']
+    extra_link_args = ['-lRpcrt4', '-lWs2_32', '-lwinmm', '-licuuc', '-licuin']
+    extra_cpp_args = ['-DSORTPP_PASS']
+else
+    thread_dep = dependency('threads')
+    extra_link_args = []
+    extra_cpp_args = []
+endif
+if zlib_dep.found()
+    pkg_requires += ['zlib']
+endif
+if zstd_dep.found()
+    pkg_requires += ['libzstd']
+endif
+# ICU is mandatory only when xapian was found; otherwise it is optional.
+if xapian_dep.found()
+    pkg_requires += ['xapian-core']
+    icu_dep = dependency('icu-i18n', static:static_linkage)
+    pkg_requires += ['icu-i18n']
+else
+    icu_dep = dependency('icu-i18n', required:false, static:static_linkage)
+endif
+
+# gtest is optional; a subproject fallback is used when it is not installed.
+gtest_dep = dependency('gtest', main:true, fallback:['gtest', 'gtest_main_dep'], required:false)
+
+inc = include_directories('include')
+
+# Order matters: later subdirectories use variables set by earlier ones
+# (e.g. `libzim`, used below, is not defined in this file).
+subdir('include')
+subdir('scripts')
+subdir('static')
+subdir('src')
+subdir('examples')
+subdir('test')
+
+pkg_mod = import('pkgconfig')
+pkg_mod.generate(libraries : libzim,
+                 version : meson.project_version(),
+                 name : 'libzim',
+                 filebase : 'libzim',
+                 description : 'A Library to zim.',
+                 requires : pkg_requires)
diff --git a/meson_options.txt b/meson_options.txt

new file mode 100644 (file)

index 0000000..2175788
--- /dev/null
+++ b/meson_options.txt
@@ -0,0 +1,14 @@
+# Build options, read by the top-level meson.build through get_option().
+# The three size options are declared as strings even though their values
+# are numbers.
+option('CLUSTER_CACHE_SIZE', type : 'string', value : '16',
+  description : 'set cluster cache size to number (default:16)')
+option('DIRENT_CACHE_SIZE', type : 'string', value : '512',
+  description : 'set dirent cache size to number (default:512)')
+option('LZMA_MEMORY_SIZE', type : 'string', value : '128',
+  description : 'set lzma uncompress memory in MB (default:128)')
+option('USE_MMAP', type: 'boolean', value: true,
+  description: 'Use mmap to avoid copy from file. (default:true, always false on windows)')
+option('USE_BUFFER_HEADER', type: 'boolean', value: true,
+  description: '''Copy (or use mmap) header index buffers. (default:true)
+Header index are used to access articles, having them in memory can improve access speed but on low memory devices it may use to many memory.
+If false, we directly read the index in the file at each article access.''')
+option('static-linkage', type : 'boolean', value : false,
+  description : 'Link statically with the dependencies.')
diff --git a/scripts/libzim-compile-resources b/scripts/libzim-compile-resources

new file mode 100755 (executable)

index 0000000..e4993ba
--- /dev/null
+++ b/scripts/libzim-compile-resources
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+
+'''
+Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 3 of the License, or any
+later version.
+
+This program is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+02110-1301, USA.
+'''
+
+import argparse
+import os.path
+import re
+
+def full_identifier(filename):
+    """Split a resource path into C++ identifiers, one per path component.
+
+    The path is normalized with os.path.normpath, split on os.sep and every
+    component is converted with to_identifier(). The result is a list; the
+    callers use all but the last element as nested namespaces and the last
+    element as the variable name.
+    """
+    parts = os.path.normpath(filename).split(os.sep)
+    # No debug print here: it used to write one line per resource to stdout
+    # on every build.
+    return [to_identifier(part) for part in parts]
+
+def to_identifier(name):
+    """Turn an arbitrary name into a valid C++ identifier.
+
+    Every character outside [0-9a-zA-Z] is replaced by '_', and a '_' is
+    prepended when the result would start with a digit. An empty name (for
+    instance the first component of an absolute path) maps to '_' instead of
+    raising IndexError.
+    """
+    ident = re.sub(r'[^0-9a-zA-Z]', '_', name)
+    if not ident:
+        return "_"
+    if ident[0].isnumeric():
+        return "_"+ident
+    return ident
+
+# Per-resource chunk of the generated .cpp file: the raw bytes as a static
+# array plus a std::string, nested in the resource's namespaces, initialised
+# through init_resource(). Doubled braces are literal braces for str.format.
+resource_impl_template = """
+static const unsigned char {data_identifier}[] = {{
+    {resource_content}
+}};
+
+namespace RESOURCE {{
+{namespaces_open}
+const std::string {identifier} = init_resource("{env_identifier}", {data_identifier}, {resource_len});
+{namespaces_close}
+}}
+"""
+
+# Per-resource branch of the generated getResource_*() lookup function.
+resource_getter_template = """
+    if (name == "{common_name}")
+        return RESOURCE::{identifier};
+"""
+
+# Per-resource declaration emitted in the generated header.
+resource_decl_template = """{namespaces_open}
+extern const std::string {identifier};
+{namespaces_close}"""
+
+class Resource:
+    """A single resource file to embed in the generated C++ sources.
+
+    Attributes:
+        filename: resource path as listed in the resource file (stripped).
+        identifier: list of C++ identifiers derived from the path components;
+            all but the last become nested namespaces, the last one is the
+            variable name.
+        data: raw bytes of the resource.
+    """
+
+    def __init__(self, base_dirs, filename):
+        """Load `filename` from the first directory of `base_dirs` that has it.
+
+        Raises:
+            FileNotFoundError: if no directory in `base_dirs` contains the file.
+        """
+        filename = filename.strip()
+        self.filename = filename
+        self.identifier = full_identifier(filename)
+        for base_dir in base_dirs:
+            try:
+                with open(os.path.join(base_dir, filename), 'rb') as f:
+                    self.data = f.read()
+                break
+            except FileNotFoundError:
+                continue
+        else:
+            # The loop ended without `break`: no base directory had the file.
+            raise FileNotFoundError(
+                "Impossible to find {} in {}".format(filename, base_dirs))
+
+    def _namespaces_open(self):
+        """Return the 'namespace x {' openers for all but the last identifier."""
+        return " ".join("namespace {} {{".format(ns) for ns in self.identifier[:-1])
+
+    def _namespaces_close(self):
+        """Return the closing braces matching _namespaces_open()."""
+        return " ".join(["}"]*(len(self.identifier)-1))
+
+    def dump_impl(self):
+        """Return the .cpp chunk defining the byte array and the std::string."""
+        # Rows of 16 bytes, each byte rendered as 0x?? .
+        rows = (self.data[start:start+16] for start in range(0, len(self.data), 16))
+        content = ",\n    ".join(
+            ", ".join("{:#04x}".format(byte) for byte in row) for row in rows)
+
+        return resource_impl_template.format(
+            data_identifier="_".join([""]+self.identifier),
+            resource_content=content,
+            resource_len=len(self.data),
+            namespaces_open=self._namespaces_open(),
+            namespaces_close=self._namespaces_close(),
+            identifier=self.identifier[-1],
+            env_identifier="RES_"+"_".join(self.identifier)+"_PATH"
+        )
+
+    def dump_getter(self):
+        """Return the lookup branch mapping the file name to the C++ variable."""
+        return resource_getter_template.format(
+            common_name=self.filename,
+            identifier="::".join(self.identifier)
+        )
+
+    def dump_decl(self):
+        """Return the extern declaration emitted in the generated header."""
+        return resource_decl_template.format(
+            namespaces_open=self._namespaces_open(),
+            namespaces_close=self._namespaces_close(),
+            identifier=self.identifier[-1]
+        )
+    
+
+
+# Skeleton of the generated .cpp file. init_resource() returns the embedded
+# bytes unless the environment variable named after the resource points to a
+# readable file, in which case that file's content is used instead. The
+# override file is opened in binary mode so that its bytes are returned
+# unchanged on every platform, like the embedded data (read with 'rb').
+master_c_template = """//This file is automaically generated. Do not modify it.
+
+#include <stdlib.h>
+#include <fstream>
+#include "{include_file}"
+
+static std::string init_resource(const char* name, const unsigned char* content, int len)
+{{
+    char * resPath = getenv(name);
+    if (NULL == resPath)
+        return std::string(reinterpret_cast<const char*>(content), len);
+    
+    std::ifstream ifs(resPath, std::ios_base::binary);
+    if (!ifs.good())
+        return std::string(reinterpret_cast<const char*>(content), len);
+    return std::string( (std::istreambuf_iterator<char>(ifs)),
+                        (std::istreambuf_iterator<char>()   ));
+}}
+
+const std::string& getResource_{basename}(const std::string& name) {{
+{RESOURCES_GETTER}
+    throw ResourceNotFound("Resource not found.");
+}}
+
+{RESOURCES}
+
+"""
+
+def gen_c_file(resources, basename):
+    """Render the generated .cpp file.
+
+    `resources` is iterated twice (definitions, then lookup branches) and
+    `basename` is the header file name to include; its identifier form is
+    used in the name of the generated getResource_* function.
+    """
+    impl_section = "\n\n".join(res.dump_impl() for res in resources)
+    getter_section = "\n\n".join(res.dump_getter() for res in resources)
+    fields = {
+        "RESOURCES": impl_section,
+        "RESOURCES_GETTER": getter_section,
+        "include_file": basename,
+        "basename": to_identifier(basename),
+    }
+    return master_c_template.format(**fields)
+ 
+
+
+# Skeleton of the generated header: include guard, extern declarations of
+# every resource inside namespace RESOURCE, the ResourceNotFound exception,
+# the getResource_<basename>() prototype and a getResource() convenience
+# macro that forwards to it. Doubled braces are literal braces for str.format.
+master_h_template = """//This file is automaically generated. Do not modify it.
+#ifndef KIWIX_{BASENAME}
+#define KIWIX_{BASENAME}
+
+#include <string>
+#include <stdexcept>
+
+namespace RESOURCE {{
+    {RESOURCES}
+}};
+
+class ResourceNotFound : public std::runtime_error {{
+  public:
+    ResourceNotFound(const std::string& what_arg):
+      std::runtime_error(what_arg)
+    {{ }};
+}};
+
+const std::string& getResource_{basename}(const std::string& name);
+
+#define getResource(a) (getResource_{basename}(a))
+
+#endif // KIWIX_{BASENAME}
+
+"""
+
+def gen_h_file(resources, basename):
+    """Render the generated header.
+
+    `basename` must already be a valid identifier; its upper-cased form is
+    used for the include guard.
+    """
+    declarations = "\n    ".join(res.dump_decl() for res in resources)
+    guard = basename.upper()
+    return master_h_template.format(
+        RESOURCES=declarations,
+        BASENAME=guard,
+        basename=basename,
+    )
+
+if __name__ == "__main__":
+    # Command line: --cxxfile and --hfile name the two files to generate,
+    # --source_dir may be repeated to add lookup directories, and the
+    # positional argument is a text file listing one resource path per line.
+    parser = argparse.ArgumentParser()
+    # Both output names are used unconditionally below, so require them
+    # instead of failing later with a TypeError on None.
+    parser.add_argument('--cxxfile',
+                        required=True,
+                        help='The Cpp file name to generate')
+    parser.add_argument('--hfile',
+                        required=True,
+                        help='The h file name to generate')
+    parser.add_argument('--source_dir',
+                        help="Additional directory where to look for resources.",
+                        action='append')
+    parser.add_argument('resource_file',
+                        help='The list of resources to compile.')
+    args = parser.parse_args()
+
+    # Resources are looked up first next to the resource list, then in the
+    # additional source directories, in the order given.
+    base_dir = os.path.dirname(os.path.realpath(args.resource_file))
+    source_dir = args.source_dir or []
+    with open(args.resource_file, 'r') as f:
+        # Skip blank lines (e.g. a trailing empty line): an empty name would
+        # make Resource try to open the base directory itself and crash.
+        resources = [Resource([base_dir]+source_dir, filename)
+                        for filename in f if filename.strip()]
+
+    h_identifier = to_identifier(os.path.basename(args.hfile))
+    with open(args.hfile, 'w') as f:
+        f.write(gen_h_file(resources, h_identifier))
+
+    with open(args.cxxfile, 'w') as f:
+        f.write(gen_c_file(resources, os.path.basename(args.hfile)))
+
diff --git a/scripts/meson.build b/scripts/meson.build

new file mode 100644 (file)

index 0000000..e1437ae
--- /dev/null
+++ b/scripts/meson.build
@@ -0,0 +1,2 @@
+
+res_compiler = find_program('libzim-compile-resources')
diff --git a/src/_dirent.h b/src/_dirent.h

new file mode 100644 (file)

index 0000000..767db5c
--- /dev/null
+++ b/src/_dirent.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_DIRENT_H
+#define ZIM_DIRENT_H
+
+#include <string>
+#include <zim/zim.h>
+#include <exception>
+#include <memory>
+
+#include "zim_types.h"
+#include "debug.h"
+
+namespace zim
+{
+  class Buffer;
+  class InvalidSize : public std::exception {};
+  // In-memory directory entry. The mimeType field doubles as a type tag:
+  // the three reserved values below mark redirects, link targets and deleted
+  // entries; any other value denotes a regular article.
+  class Dirent
+  {
+    protected:
+      uint16_t mimeType;
+
+      uint32_t version;
+
+      cluster_index_t clusterNumber;  // only used when redirect is false
+      blob_index_t blobNumber;    // only used when redirect is false
+
+      article_index_t redirectIndex;  // only used when redirect is true
+
+      char ns;
+      std::string title;
+      std::string url;
+      std::string parameter;
+
+    public:
+      // these constants are put into mimeType field
+      static const uint16_t redirectMimeType = 0xffff;
+      static const uint16_t linktargetMimeType = 0xfffe;
+      static const uint16_t deletedMimeType = 0xfffd;
+
+      // Zero-initialises every numeric field; the strings start empty.
+      Dirent()
+        : mimeType(0),
+          version(0),
+          clusterNumber(0),
+          blobNumber(0),
+          redirectIndex(0),
+          ns('\0')
+      {}
+
+      // Defined out of line; presumably parses a serialized dirent from the
+      // buffer — confirm in the implementation file.
+      Dirent(std::unique_ptr<Buffer> buffer);
+
+      bool isRedirect() const                 { return mimeType == redirectMimeType; }
+      bool isLinktarget() const               { return mimeType == linktargetMimeType; }
+      bool isDeleted() const                  { return mimeType == deletedMimeType; }
+      bool isArticle() const                  { return !isRedirect() && !isLinktarget() && !isDeleted(); }
+      uint16_t getMimeType() const            { return mimeType; }
+
+      uint32_t getVersion() const            { return version; }
+      void setVersion(uint32_t v)            { version = v; }
+
+      // Cluster and blob numbers read as 0 for redirects; the redirect index
+      // reads as 0 for everything that is not a redirect.
+      cluster_index_t getClusterNumber() const      { return isRedirect() ? cluster_index_t(0) : clusterNumber; }
+      blob_index_t  getBlobNumber() const         { return isRedirect() ? blob_index_t(0) : blobNumber; }
+
+      article_index_t getRedirectIndex() const      { return isRedirect() ? redirectIndex : article_index_t(0); }
+
+      char getNamespace() const               { return ns; }
+      // Falls back to the url when no title has been set.
+      const std::string& getTitle() const     { return title.empty() ? url : title; }
+      const std::string& getUrl() const       { return url; }
+      std::string getLongUrl() const;
+      const std::string& getParameter() const { return parameter; }
+
+      // Size computed as a fixed part (12 for a redirect, 16 otherwise), the
+      // url and parameter lengths plus 2, and the title length when the
+      // stored title differs from the url. The constants presumably mirror
+      // the on-disk dirent layout — confirm against the format spec.
+      size_t getDirentSize() const
+      {
+        size_t ret = (isRedirect() ? 12 : 16) + url.size() + parameter.size() + 2;
+        if (title != url)
+          ret += title.size();
+        return ret;
+      }
+
+      void setTitle(const std::string& title_)
+      {
+        title = title_;
+      }
+
+      void setUrl(char ns_, const std::string& url_)
+      {
+        ns = ns_;
+        url = url_;
+      }
+
+      void setParameter(const std::string& parameter_)
+      {
+        parameter = parameter_;
+      }
+
+      // Turns the entry into a redirect to idx; no assertion on the previous
+      // mimeType, unlike the three setters further down.
+      void setRedirect(article_index_t idx)
+      {
+        redirectIndex = idx;
+        mimeType = redirectMimeType;
+      }
+
+      void setMimeType(uint16_t mime)
+      {
+        mimeType = mime;
+      }
+
+      // The setters below assert that mimeType is still 0, i.e. that the
+      // entry has not been given a type yet.
+      void setLinktarget()
+      {
+        ASSERT(mimeType, ==, 0);
+        mimeType = linktargetMimeType;
+      }
+
+      void setDeleted()
+      {
+        ASSERT(mimeType, ==, 0);
+        mimeType = deletedMimeType;
+      }
+
+      void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_)
+      {
+        ASSERT(mimeType, ==, 0);
+        mimeType = mimeType_;
+        clusterNumber = clusterNumber_;
+        blobNumber = blobNumber_;
+      }
+  };
+}
+
+#endif // ZIM_DIRENT_H
+
diff --git a/src/article.cpp b/src/article.cpp

new file mode 100644 (file)

index 0000000..045228f
--- /dev/null
+++ b/src/article.cpp
@@ -0,0 +1,288 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/article.h>
+#include "template.h"
+#include "_dirent.h"
+#include "cluster.h"
+#include <zim/fileheader.h>
+#include "fileimpl.h"
+#include "file_part.h"
+#include <sstream>
+#include <iostream>
+#include <stdexcept>
+#include <limits>
+#include "log.h"
+
+log_define("zim.article")
+
+namespace zim
+{
+  // Size of this article's blob, looked up in the cluster referenced by the
+  // dirent.
+  // NOTE(review): unlike getCluster(), there is no redirect/linktarget/deleted
+  // guard here; Dirent::getClusterNumber() reads as 0 for redirects, so this
+  // would query cluster 0 / blob 0 for them — confirm callers never do that.
+  size_type Article::getArticleSize() const
+  {
+    auto dirent = getDirent();
+    return size_type(file->getCluster(dirent->getClusterNumber())
+                         ->getBlobSize(dirent->getBlobNumber()));
+  }
+
+  namespace
+  {
+    // Template parser callback that renders into `out` on behalf of
+    // `article`. maxRecurse is the remaining recursion budget: onToken and
+    // onLink refuse to recurse once it reaches 0 and pass maxRecurse - 1 down.
+    class Ev : public TemplateParser::Event
+    {
+        std::ostream& out;
+        Article& article;
+        std::shared_ptr<FileImpl> file;
+        unsigned maxRecurse;
+
+      public:
+        // Stores references to out_ and article_; both must outlive the Ev.
+        Ev(std::ostream& out_, Article& article_, std::shared_ptr<FileImpl> file_, unsigned maxRecurse_)
+          : out(out_),
+            article(article_),
+            file(file_),
+            maxRecurse(maxRecurse_)
+          { }
+        void onData(const std::string& data);
+        void onToken(const std::string& token);
+        // The second parameter is named `url` in the definition and is looked
+        // up as a url there.
+        void onLink(char ns, const std::string& title);
+    };
+
+    // Plain template data is copied to the output unchanged.
+    void Ev::onData(const std::string& data)
+    {
+      out << data;
+    }
+
+    // Expands a template token. "title", "url" and "namespace" are replaced
+    // by the corresponding article property; "content" renders the article
+    // itself (without layout) with one level less of recursion budget.
+    // Unknown tokens are logged and written back verbatim as <%token%>.
+    // Throws std::runtime_error when "content" is requested with no budget left.
+    void Ev::onToken(const std::string& token)
+    {
+      log_trace("onToken(\"" << token << "\")");
+
+      if (token == "title")
+        out << article.getTitle();
+      else if (token == "url")
+        out << article.getUrl();
+      else if (token == "namespace")
+        out << article.getNamespace();
+      else if (token == "content")
+      {
+        // maxRecurse is unsigned, so this is effectively a test for 0.
+        if (maxRecurse <= 0)
+          throw std::runtime_error("maximum recursive limit is reached");
+        article.getPage(out, false, maxRecurse - 1);
+      }
+      else
+      {
+        log_warn("unknown token \"" << token  << "\" found in template");
+        out << "<%" << token << "%>";
+      }
+    }
+
+    // Renders the article found at ns/url (without layout) into the output,
+    // spending one level of the recursion budget. Throws std::runtime_error
+    // when the budget is exhausted or when the article cannot be found.
+    void Ev::onLink(char ns, const std::string& url)
+    {
+      // maxRecurse is unsigned: "== 0" is the same test as "<= 0".
+      if (maxRecurse == 0)
+        throw std::runtime_error("maximum recursive limit is reached");
+
+      std::pair<bool, article_index_t> lookup = file->findx(ns, url);
+      if (!lookup.first)
+        throw std::runtime_error(std::string("impossible to find article ") + std::string(1, ns) + std::string("/") + url);
+
+      Article linked(file, article_index_type(lookup.second));
+      linked.getPage(out, false, maxRecurse - 1);
+    }
+
+  }
+
+  // Fetches the dirent for this article's index from the file. Every
+  // accessor below goes through this call.
+  std::shared_ptr<const Dirent> Article::getDirent() const
+  {
+    return file->getDirent(article_index_t(idx));
+  }
+
+  std::string Article::getParameter() const
+  {
+    return getDirent()->getParameter();
+  }
+
+  // Dirent::getTitle() falls back to the url when the title is empty.
+  std::string Article::getTitle() const
+  {
+    return getDirent()->getTitle();
+  }
+
+  std::string Article::getUrl() const
+  {
+    return getDirent()->getUrl();
+  }
+
+  std::string Article::getLongUrl() const
+  {
+    return getDirent()->getLongUrl();
+  }
+
+  // Raw numeric mime type stored in the dirent (including the reserved
+  // redirect/linktarget/deleted values).
+  uint16_t Article::getLibraryMimeType() const
+  {
+    return getDirent()->getMimeType();
+  }
+
+  // Mime type string, resolved by the file from the numeric value.
+  const std::string& Article::getMimeType() const
+  {
+    return file->getMimeType(getLibraryMimeType());
+  }
+
+  bool Article::isRedirect() const
+  {
+    return getDirent()->isRedirect();
+  }
+
+  bool Article::isLinktarget() const
+  {
+    return getDirent()->isLinktarget();
+  }
+
+  bool Article::isDeleted() const
+  {
+    return getDirent()->isDeleted();
+  }
+
+  char Article::getNamespace() const
+  {
+    return getDirent()->getNamespace();
+  }
+
+  // Dirent::getRedirectIndex() reads as 0 when the entry is not a redirect.
+  article_index_type Article::getRedirectIndex() const
+  {
+    return article_index_type(getDirent()->getRedirectIndex());
+  }
+
+  // No check that this article actually is a redirect: for a non-redirect
+  // this builds the article at index 0.
+  Article Article::getRedirectArticle() const
+  {
+    return Article(file, getRedirectIndex());
+  }
+
+  // Cluster holding this article's data, or an empty pointer when the entry
+  // is a redirect, a link target or deleted (i.e. not a regular article).
+  std::shared_ptr<const Cluster> Article::getCluster() const
+  {
+    const auto entry = getDirent();
+    if (entry->isArticle())
+      return file->getCluster(entry->getClusterNumber());
+    return std::shared_ptr<const Cluster>();
+  }
+  // Number of the cluster holding this article's data, or the maximum value
+  // of cluster_index_type when the entry is a redirect, a link target or
+  // deleted.
+  cluster_index_type Article::getClusterNumber() const
+  {
+    const auto entry = getDirent();
+    if (entry->isArticle())
+      return entry->getClusterNumber().v;
+    return std::numeric_limits<cluster_index_type>::max();
+  }
+
+  // Data from `offset` to the end of the article.
+  // NOTE(review): the subtraction is unsigned (size_type/offset_type are
+  // uint64_t), so an offset larger than the article size wraps around to a
+  // huge size — confirm callers validate the offset.
+  Blob Article::getData(offset_type offset) const
+  {
+    auto size = getArticleSize()-offset;
+    return getData(offset, size);
+  }
+
+  // `size` bytes of article data starting at `offset`. Returns an empty Blob
+  // when getCluster() yields no cluster (redirect, link target or deleted
+  // entry). No range check is done here; it is left to Cluster::getBlob.
+  Blob Article::getData(offset_type offset, size_type size) const
+  {
+    std::shared_ptr<const Cluster> cluster = getCluster();
+    if (!cluster) {
+      return Blob();
+    }
+    return cluster->getBlob(getDirent()->getBlobNumber(), offset_t(offset), zsize_t(size));
+  }
+
+  // Offset of this article's blob as reported by the file, or 0 when the
+  // entry is a redirect, a link target or deleted.
+  offset_type Article::getOffset() const
+  {
+    const auto entry = getDirent();
+    if (!entry->isArticle())
+        return 0;
+    return offset_type(file->getBlobOffset(entry->getClusterNumber(), entry->getBlobNumber()));
+  }
+
+  // Returns (file name, offset inside that file) allowing the article data
+  // to be read directly from disk, or ("", 0) when that is not possible:
+  // the entry is not a regular article, the cluster is compressed, or the
+  // data spans more than one file part.
+  std::pair<std::string, offset_type> Article::getDirectAccessInformation() const
+  {
+    auto dirent = getDirent();
+    if ( dirent->isRedirect()
+      || dirent->isLinktarget()
+      || dirent->isDeleted() ) {
+        return std::make_pair("", 0);
+    }
+
+    auto full_offset = file->getBlobOffset(dirent->getClusterNumber(),
+                                           dirent->getBlobNumber());
+
+    if (!full_offset) {
+      // cluster is compressed
+      return std::make_pair("", 0);
+    }
+    auto part_its = file->getFileParts(full_offset, zsize_t(getArticleSize()));
+    auto range = part_its.first->first;
+    auto part = part_its.first->second;
+    // Advancing past the first part must reach the end of the range,
+    // otherwise the article is split over several parts.
+    if (++part_its.first != part_its.second) {
+      return std::make_pair("", 0);
+    }
+    // Translate the global offset into an offset local to the part.
+    auto local_offset = full_offset - range.min;
+    return std::make_pair(part->filename(), offset_type(local_offset));
+  }
+
+  // Convenience overload: renders the page into a string through the
+  // stream-based getPage().
+  std::string Article::getPage(bool layout, unsigned maxRecurse)
+  {
+    std::ostringstream s;
+    getPage(s, layout, maxRecurse);
+    return s.str();
+  }
+
+  // Writes the rendered page to `out`.
+  //  - For html or template articles, when `layout` is set and the file has
+  //    a layout page, the layout page is run through the template parser.
+  //  - Otherwise, a template article is itself run through the template
+  //    parser.
+  //  - In every other case the raw article data is written.
+  // maxRecurse bounds nested template expansion (see Ev).
+  void Article::getPage(std::ostream& out, bool layout, unsigned maxRecurse)
+  {
+    log_trace("Article::getPage(" << layout << ", " << maxRecurse << ')');
+
+    // Prefix match: anything starting with "text/html" counts as html.
+    if (getMimeType().compare(0, 9, "text/html") == 0 || getMimeType() == MimeHtmlTemplate)
+    {
+      if (layout && file->getFileheader().hasLayoutPage())
+      {
+        Article layoutPage(file, file->getFileheader().getLayoutPage());
+        Blob data = layoutPage.getData();
+
+        Ev ev(out, *this, file, maxRecurse);
+        log_debug("call template parser");
+        TemplateParser parser(&ev);
+        // The parser is fed one character at a time, then flushed.
+        for (const char* p = data.data(); p != data.end(); ++p)
+          parser.parse(*p);
+        parser.flush();
+
+        return;
+      }
+      else if (getMimeType() == MimeHtmlTemplate)
+      {
+        Blob data = getData();
+
+        Ev ev(out, *this, file, maxRecurse);
+        TemplateParser parser(&ev);
+        for (const char* p = data.data(); p != data.end(); ++p)
+          parser.parse(*p);
+        parser.flush();
+
+        return;
+      }
+    }
+
+    // default case - the template cases have returned above
+    out << getData();
+  }
+
+}
diff --git a/src/blob.cpp b/src/blob.cpp

new file mode 100644 (file)

index 0000000..fe5b82f
--- /dev/null
+++ b/src/blob.cpp
@@ -0,0 +1,49 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+
+#include "zim/blob.h"
+#include "debug.h"
+#include "buffer.h"
+
+namespace zim {
+
+// Creates an empty blob: null data pointer, zero size.
+Blob::Blob()
+ : _data(nullptr),
+   _size(0)
+{
+}
+
+// Creates a blob referring to `size` bytes starting at `data`.
+// Only the pointer and the size are stored here; nothing is copied.
+Blob::Blob(const char* data, size_type size)
+ : _data(data), _size(size)
+{
+  // the size must be strictly below SIZE_MAX ...
+  ASSERT(size, <, SIZE_MAX);
+  // ... and data + size must not wrap around the address space
+  ASSERT(data, <, (void*)(SIZE_MAX-size));
+}
+
+// Creates a blob exposing the whole content of `buffer`.
+// The shared pointer is stored in the blob alongside the raw pointer and size.
+Blob::Blob(std::shared_ptr<const Buffer> buffer)
+ : _data(buffer->data()), _size(size_type(buffer->size())), _buffer(buffer)
+{
+}
+
+
+
+
+} //zim
diff --git a/src/buffer.cpp b/src/buffer.cpp

new file mode 100644 (file)

index 0000000..a1fa84d
--- /dev/null
+++ b/src/buffer.cpp
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "buffer.h"
+
+#include <sys/stat.h>
+#include <cstdio>
+#include <cstdlib>
+#include <fcntl.h>
+#include <string.h>
+#include <errno.h>
+#include <sstream>
+
+#ifndef _WIN32
+#  include <sys/mman.h>
+#  include <unistd.h>
+#endif
+
+namespace zim {
+
+// Returns a SubBuffer viewing `size` bytes of this buffer starting at `offset`.
+// The SubBuffer is built from a shared pointer to this buffer.
+std::shared_ptr<const Buffer> Buffer::sub_buffer(offset_t offset, zsize_t size) const
+{
+  auto self = shared_from_this();
+  return std::make_shared<SubBuffer>(self, offset, size);
+}
+
+#ifdef ENABLE_USE_MMAP
+MMapBuffer::MMapBuffer(int fd, offset_t offset, zsize_t size):
+  Buffer(size),
+  _offset(0)
+{
+  offset_t pa_offset(offset.v & ~(sysconf(_SC_PAGE_SIZE) - 1));
+  _offset = offset-pa_offset;
+#if defined(__APPLE__) || defined(__OpenBSD__)
+  #define MAP_FLAGS MAP_PRIVATE
+#else
+  #define MAP_FLAGS MAP_PRIVATE|MAP_POPULATE
+#endif
+#if !MMAP_SUPPORT_64
+  if(pa_offset.v >= INT32_MAX) {
+    throw MMapException();
+  }
+#endif
+  _data = (char*)mmap(NULL, size.v + _offset.v, PROT_READ, MAP_FLAGS, fd, pa_offset.v);
+  if (_data == MAP_FAILED )
+  {
+    std::ostringstream s;
+    s << "Cannot mmap size " << size.v << " at off " << offset.v << " : " << strerror(errno);
+    throw std::runtime_error(s.str());
+  }
+#undef MAP_FLAGS
+}
+
+// Releases the mapping; the length matches the one passed to mmap in the
+// constructor (buffer size plus the alignment slack).
+MMapBuffer::~MMapBuffer()
+{
+  const auto mappedLength = size_.v + _offset.v;
+  munmap(_data, mappedLength);
+}
+
+#endif
+
+} //zim
diff --git a/src/buffer.h b/src/buffer.h

new file mode 100644 (file)

index 0000000..5d07aea
--- /dev/null
+++ b/src/buffer.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_BUFFER_H_
+#define ZIM_BUFFER_H_
+
+#include <cstddef>
+#include <exception>
+#include <memory>
+#include <iostream>
+
+#include "config.h"
+#include "zim_types.h"
+#include "endian_tools.h"
+#include "debug.h"
+
+namespace zim {
+
+// Thrown by the MMapBuffer constructor when the page-aligned offset cannot be
+// mapped because 64 bit mmap offsets are not supported.
+// NOTE(review): the base class is inherited privately (class default), so a
+// `catch (std::exception&)` handler will not catch this type - confirm that
+// this is intended.
+class MMapException : std::exception {};
+
+class Buffer : public std::enable_shared_from_this<Buffer> {
+  public:
+    Buffer(zsize_t size)
+      : size_(size)
+    {
+      ASSERT(size_.v, <, SIZE_MAX);
+    };
+    virtual ~Buffer() {};
+    virtual const char* data(offset_t offset=offset_t(0)) const = 0;
+    virtual char at(offset_t offset) const {
+        return *(data(offset));
+    }
+    zsize_t size() const { return size_; }
+    virtual std::shared_ptr<const Buffer> sub_buffer(offset_t offset, zsize_t size) const;
+
+    template<typename T>
+    T as(offset_t offset) const {
+      ASSERT(offset.v, <, size_.v);
+      ASSERT(offset.v+sizeof(T), <=, size_.v);
+      return fromLittleEndian<T>(data(offset));
+    }
+
+  protected:
+    const zsize_t size_;
+};
+
+
+template<bool CLEAN_AT_END>
+class MemoryBuffer : public Buffer {
+  public:
+    MemoryBuffer(const char* buffer, zsize_t size)
+      : Buffer(size),
+        _data(buffer)
+    {}
+
+    virtual ~MemoryBuffer() {
+        if ( CLEAN_AT_END ) {
+          delete [] _data;
+        }
+    }
+
+    const char* data(offset_t offset) const {
+        ASSERT(offset.v, <=, size_.v);
+        return _data + offset.v;
+    }
+  private:
+    const char* _data;
+};
+
+
+#ifdef ENABLE_USE_MMAP
+// Buffer backed by a read-only memory mapping of a file region.
+class MMapBuffer : public Buffer {
+  public:
+    MMapBuffer(int fd, offset_t offset, zsize_t size);
+    ~MMapBuffer();
+
+    // Pointer to the byte at `offset` of the buffer. `_offset` is added
+    // first because the mapping starts at a page boundary.
+    const char* data(offset_t offset) const {
+      offset_t shifted = offset;
+      shifted += _offset;
+      return _data + shifted.v;
+    }
+
+  private:
+    offset_t _offset;  // distance between the start of the mapping and the data
+    char* _data;       // address returned by mmap
+};
+#endif
+
+
+class SubBuffer : public Buffer {
+  public:
+    SubBuffer(const std::shared_ptr<const Buffer> src, offset_t offset, zsize_t size)
+      : Buffer(size),
+        _data(src, src->data(offset))
+    {
+      ASSERT(offset.v+size.v, <=, src->size().v);
+    }
+
+  const char* data(offset_t offset) const {
+        ASSERT(offset.v, <=, size_.v);
+        return _data.get() + offset.v;
+    }
+
+  private:
+    std::shared_ptr<const char> _data;
+};
+
+};
+
+#endif //ZIM_BUFFER_H_
diff --git a/src/cache.h b/src/cache.h

new file mode 100644 (file)

index 0000000..b889fd9
--- /dev/null
+++ b/src/cache.h
@@ -0,0 +1,345 @@
+/*
+ * Copyright (C) 2008 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_CACHE_H
+#define ZIM_CACHE_H
+
+#include <map>
+#include <limits>
+#include <iostream>
+
+namespace zim
+{
+  /**
+     Implements a container for caching elements.
+
+     The cache holds a list of key-value-pairs. There are 2 main operations for
+     accessing the cache: put and get. Put takes a key and a value and puts the
+     element into the list. Get takes a key and optional a value. If the value
+     for the key is found, it is returned. The passed value otherwise. By
+     default the value is constructed with the empty ctor of the value-type.
+
+     The cache has a maximum size, after which key-value-pairs are dropped,
+     when a new item is put into the cache.
+
+     The algorithm for this cache is as follows:
+       - when the cache is not full, new elements are appended
+       - new elements are put into the middle of the list otherwise
+       - the last element of the list is then dropped
+       - when getting a value and the value is found, it is put to the
+         beginning of the list
+
+     Elements are stored in a std::map, so lookups use the ordering
+     (operator<) of the key type.
+
+     The caching algorithm keeps elements, which are fetched more than once in
+     the first half of the list. In the second half the elements are either new
+     or the elements are pushed from the first half to the second half by other
+     elements, which are found in the cache.
+
+     You should be aware, that the key type should be simple. Comparing keys
+     must be cheap. Copying elements (both key and value) must be possible and
+     should be cheap, since they are moved in the underlying container.
+
+   */
+  template <typename Key, typename Value>
+  class Cache
+  {
+      struct Data
+      {
+        bool winner;
+        unsigned serial;
+        Value value;
+        Data() { }
+        Data(bool winner_, unsigned serial_, const Value& value_)
+          : winner(winner_),
+            serial(serial_),
+            value(value_)
+            { }
+      };
+
+      typedef std::map<Key, Data> DataType;
+      DataType data;
+
+      typename DataType::size_type maxElements;
+      unsigned serial;
+      unsigned hits;
+      unsigned misses;
+
+      unsigned _nextSerial()
+      {
+        if (serial == std::numeric_limits<unsigned>::max())
+        {
+          for (typename DataType::iterator it = data.begin(); it != data.end(); ++it)
+            it->second.serial = 0;
+          serial = 1;
+        }
+
+        return serial++;
+      }
+
+      typename DataType::iterator _getOldest(bool winner)
+      {
+        typename DataType::iterator foundElement = data.begin();
+
+        typename DataType::iterator it = data.begin();
+
+        for (++it; it != data.end(); ++it)
+          if (it->second.winner == winner
+            && (foundElement->second.winner != winner || it->second.serial < foundElement->second.serial))
+              foundElement = it;
+
+        return foundElement;
+      }
+
+      typename DataType::iterator _getNewest(bool winner)
+      {
+        typename DataType::iterator foundElement = data.begin();
+
+        typename DataType::iterator it = data.begin();
+
+        for (++it; it != data.end(); ++it)
+          if (it->second.winner == winner
+            && (foundElement->second.winner != winner || it->second.serial > foundElement->second.serial))
+              foundElement = it;
+
+        return foundElement;
+      }
+
+      // drop one element
+      void _dropLooser()
+      {
+        // look for the oldest element in the list of loosers to drop it
+        data.erase(_getOldest(false));
+      }
+
+      void _makeLooser()
+      {
+        // look for the oldest element in the list of winners to make it a looser
+        typename DataType::iterator it = _getOldest(true);
+        it->second.winner = false;
+        it->second.serial = _nextSerial();
+      }
+
+    public:
+      typedef typename DataType::size_type size_type;
+      typedef Value value_type;
+
+      explicit Cache(size_type maxElements_)
+        : maxElements(maxElements_ + (maxElements_ & 1)),
+          serial(0),
+          hits(0),
+          misses(0)
+        { }
+
+      /// returns the number of elements currently in the cache
+      size_type size() const        { return data.size(); }
+
+      /// returns the maximum number of elements in the cache
+      size_type getMaxElements() const      { return maxElements; }
+
+      void setMaxElements(size_type maxElements_)
+      {
+        size_type numWinners = size() < maxElements / 2 ? size() : maxElements / 2;
+
+        maxElements_ += (maxElements_ & 1);
+
+        if (maxElements_ > maxElements)
+        {
+          maxElements = maxElements_;
+
+          while (numWinners < maxElements / 2)
+          {
+            _getNewest(false)->winner = true;
+            ++numWinners;
+          }
+        }
+        else
+        {
+          while (maxElements > maxElements_)
+          {
+            _dropLooser();
+            _dropLooser();
+            _makeLooser();
+            maxElements -= 2;
+          }
+
+          while (numWinners > maxElements / 2)
+          {
+            _getNewest(true)->winner = false;
+            --numWinners;
+          }
+        }
+      }
+
+      /// removes a element from the cache and returns true, if found
+      bool erase(const Key& key)
+      {
+        typename DataType::iterator it = data.find(key);
+        if (it == data.end())
+          return false;
+
+        if (it->second.winner)
+          _getNewest(false)->winner=true;
+
+        data.erase(it);
+        return true;
+      }
+
+      /// clears the cache.
+      void clear(bool stats = false)
+      {
+        data.clear();
+        if (stats)
+          hits = misses = 0;
+      }
+
+      /// puts a new element in the cache. If the element is already found in
+      /// the cache, it is considered a cache hit and pushed to the top of the
+      /// list.
+      void put(const Key& key, const Value& value)
+      {
+        typename DataType::iterator it;
+        if (data.size() < maxElements)
+        {
+          data.insert(data.begin(),
+            typename DataType::value_type(key,
+              Data(data.size() < maxElements / 2, _nextSerial(), value)));
+        }
+        else if ((it = data.find(key)) == data.end())
+        {
+          // element not found
+          _dropLooser();
+          data.insert(data.begin(),
+            typename DataType::value_type(key,
+              Data(false, _nextSerial(), value)));
+        }
+        else
+        {
+          // element found
+          it->second.serial = _nextSerial();
+          if (!it->second.winner)
+          {
+            // move element to the winner part
+            it->second.winner = true;
+            _makeLooser();
+          }
+        }
+      }
+
+      /// puts a new element on the top of the cache. If the element is already
+      /// found in the cache, it is considered a cache hit and pushed to the
+      /// top of the list. This method actually overrides the need, that a element
+      /// needs a hit to get to the top of the cache.
+      void put_top(const Key& key, const Value& value)
+      {
+        typename DataType::iterator it;
+        if (data.size() < maxElements)
+        {
+          if (data.size() >= maxElements / 2)
+            _makeLooser();
+
+          data.insert(data.begin(),
+            typename DataType::value_type(key,
+              Data(true, _nextSerial(), value)));
+        }
+        else if ((it = data.find(key)) == data.end())
+        {
+          // element not found
+          _dropLooser();
+          _makeLooser();
+          data.insert(data.begin(),
+            typename DataType::value_type(key,
+              Data(true, _nextSerial(), value)));
+        }
+        else
+        {
+          // element found
+          it->second.serial = _nextSerial();
+          if (!it->second.winner)
+          {
+            // move element to the winner part
+            it->second.winner = true;
+            _makeLooser();
+          }
+        }
+      }
+
+      Value* getptr(const Key& key)
+      {
+        typename DataType::iterator it = data.find(key);
+        if (it == data.end())
+          return 0;
+
+        it->second.serial = _nextSerial();
+
+        if (!it->second.winner)
+        {
+          // move element to the winner part
+          it->second.winner = true;
+          _makeLooser();
+        }
+
+        return &it->second.value;
+      }
+
+      /// returns a pair of values - a flag, if the value was found and the
+      /// value if found or the passed default otherwise. If the value is
+      /// found it is a cahce hit and pushed to the top of the list.
+      std::pair<bool, Value> getx(const Key& key, Value def = Value())
+      {
+        Value* v = getptr(key);
+        return v ? std::pair<bool, Value>(true, *v)
+                 : std::pair<bool, Value>(false, def);
+      }
+
+      /// returns the value to a key or the passed default value if not found.
+      /// If the value is found it is a cahce hit and pushed to the top of the
+      /// list.
+      Value get(const Key& key, Value def = Value())
+      {
+        return getx(key, def).second;
+      }
+
+      /// returns the number of hits.
+      unsigned getHits() const    { return hits; }
+      /// returns the number of misses.
+      unsigned getMisses() const  { return misses; }
+      /// returns the cache hit ratio between 0 and 1.
+      double hitRatio() const     { return hits+misses > 0 ? static_cast<double>(hits)/static_cast<double>(hits+misses) : 0; }
+      /// returns the ratio, between held elements and maximum elements.
+      double fillfactor() const   { return static_cast<double>(data.size()) / static_cast<double>(maxElements); }
+
+/*
+      void dump(std::ostream& out) const
+      {
+        out << "cache max size=" << maxElements << " current size=" << size() << '\n';
+        for (typename DataType::const_iterator it = data.begin(); it != data.end(); ++it)
+        {
+          out << "\tkey=\"" << it->first << "\" value=\"" << it->second.value << "\" serial=" << it->second.serial << " winner=" << it->second.winner << '\n';
+        }
+        out << "--------\n";
+      }
+*/
+
+  };
+
+}
+
+#endif // ZIM_CACHE_H
diff --git a/src/cluster.cpp b/src/cluster.cpp

new file mode 100644 (file)

index 0000000..1da439b
--- /dev/null
+++ b/src/cluster.cpp
@@ -0,0 +1,143 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "cluster.h"
+#include <zim/blob.h>
+#include <zim/error.h>
+#include "file_reader.h"
+#include "endian_tools.h"
+#include <algorithm>
+#include <stdlib.h>
+#include <sstream>
+
+#include "log.h"
+
+#include "config.h"
+
+log_define("zim.cluster")
+
+#define log_debug1(e)
+
+namespace zim
+{
+  Cluster::Cluster(std::shared_ptr<const Reader> reader_, CompressionType comp, bool isExtended)
+    : compression(comp),
+      isExtended(isExtended),
+      reader(reader_),
+      startOffset(0)
+  {
+    auto d = reader->offset();
+    if (isExtended) {
+      startOffset = read_header<uint64_t>();
+    } else {
+      startOffset = read_header<uint32_t>();
+    }
+    reader = reader->sub_reader(startOffset);
+    auto d1 = reader->offset();
+    ASSERT(d+startOffset, ==, d1);
+  }
+
+  /* This return the number of char read */
+  template<typename OFFSET_TYPE>
+  offset_t Cluster::read_header()
+  {
+    // read first offset, which specifies, how many offsets we need to read
+    OFFSET_TYPE offset;
+    offset = reader->read<OFFSET_TYPE>(offset_t(0));
+
+    size_t n_offset = offset / sizeof(OFFSET_TYPE);
+    offset_t data_address(offset);
+
+    // read offsets
+    offsets.clear();
+    offsets.reserve(n_offset);
+    offsets.push_back(offset_t(0));
+    
+    auto buffer = reader->get_buffer(offset_t(0), zsize_t(offset));
+    offset_t current = offset_t(sizeof(OFFSET_TYPE));
+    while (--n_offset)
+    {
+      OFFSET_TYPE new_offset = buffer->as<OFFSET_TYPE>(current);
+      ASSERT(new_offset, >=, offset);
+      ASSERT(offset, >=, data_address.v);
+      ASSERT(offset, <=, reader->size().v);
+      
+      offset = new_offset;
+      offsets.push_back(offset_t(offset - data_address.v));
+      current += sizeof(OFFSET_TYPE);
+    }
+    ASSERT(offset, ==, reader->size().v);
+    return data_address;
+  }
+
+  // Returns blob number `n` of the cluster. An empty Blob is returned when
+  // the cluster has no size or the blob size exceeds SIZE_MAX.
+  Blob Cluster::getBlob(blob_index_t n) const
+  {
+    if (!size())
+      return Blob();
+
+    const auto blobSize = getBlobSize(n);
+    if (blobSize.v > SIZE_MAX)
+      return Blob();
+
+    const auto blobStart = offsets[blob_index_type(n)];
+    return Blob(reader->get_buffer(blobStart, blobSize));
+  }
+
+  Blob Cluster::getBlob(blob_index_t n, offset_t offset, zsize_t size) const
+  {
+    if (this->size()) {
+      offset += offsets[blob_index_type(n)];
+      size = std::min(size, getBlobSize(n));
+      if (size.v > SIZE_MAX) {
+        return Blob();
+      }
+      auto buffer = reader->get_buffer(offset, size);
+      return Blob(buffer);
+    } else {
+      return Blob();
+    }
+  }
+
+  // Size of the offset table (one entry per stored offset, 8 bytes each when
+  // extended, 4 bytes otherwise) plus the size of the reader.
+  zsize_t Cluster::size() const
+  {
+    const size_t entryWidth = isExtended ? sizeof(uint64_t) : sizeof(uint32_t);
+    return zsize_t(offsets.size() * entryWidth + reader->size().v);
+  }
+
+  // Reads the first table entry at `offset`, then reads the entry located
+  // one entry width before the position that first entry points to, and
+  // returns that second value as a size.
+  template<typename OFFSET_TYPE>
+  zsize_t _read_size(const Reader* reader, offset_t offset)
+  {
+    OFFSET_TYPE firstEntry = reader->read<OFFSET_TYPE>(offset);
+    auto lastEntryPos = offset+offset_t(firstEntry-sizeof(OFFSET_TYPE));
+    return zsize_t(reader->read<OFFSET_TYPE>(lastEntryPos));
+  }
+
+  // Dispatches to _read_size with 64 bit table entries for extended
+  // clusters and 32 bit entries otherwise.
+  zsize_t Cluster::read_size(const Reader* reader, bool isExtended, offset_t offset)
+  {
+    return isExtended ? _read_size<uint64_t>(reader, offset)
+                      : _read_size<uint32_t>(reader, offset);
+  }
+
+}
diff --git a/src/cluster.h b/src/cluster.h

new file mode 100644 (file)

index 0000000..c376e96
--- /dev/null
+++ b/src/cluster.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_CLUSTER_H
+#define ZIM_CLUSTER_H
+
+#include <zim/zim.h>
+#include "buffer.h"
+#include "zim_types.h"
+#include "file_reader.h"
+#include <iosfwd>
+#include <vector>
+#include <memory>
+
+#include "zim_types.h"
+
+namespace zim
+{
+  class Blob;
+  class Reader;
+
+  class Cluster : public std::enable_shared_from_this<Cluster> {
+      typedef std::vector<offset_t> Offsets;
+
+      const CompressionType compression;
+      const bool isExtended;
+      Offsets offsets;
+      std::shared_ptr<const Reader> reader;
+      offset_t startOffset;
+
+      template<typename OFFSET_TYPE>
+      offset_t read_header();
+
+    public:
+      Cluster(std::shared_ptr<const Reader> reader, CompressionType comp, bool isExtended);
+      CompressionType getCompression() const   { return compression; }
+      bool isCompressed() const                { return compression != zimcompDefault && compression != zimcompNone; }
+
+      blob_index_t count() const               { return blob_index_t(offsets.size() - 1); }
+      zsize_t size() const;
+
+      zsize_t getBlobSize(blob_index_t n) const  { return zsize_t(offsets[blob_index_type(n)+1].v
+                                                                - offsets[blob_index_type(n)].v); }
+      offset_t getBlobOffset(blob_index_t n) const { return startOffset + offsets[blob_index_type(n)]; }
+      Blob getBlob(blob_index_t n) const;
+      Blob getBlob(blob_index_t n, offset_t offset, zsize_t size) const;
+      void clear();
+
+      void init_from_buffer(Buffer& buffer);
+      static zsize_t read_size(const Reader* reader, bool isExtended, offset_t offset);
+  };
+
+}
+
+#endif // ZIM_CLUSTER_H
diff --git a/src/compression.cpp b/src/compression.cpp

new file mode 100644 (file)

index 0000000..436f565
--- /dev/null
+++ b/src/compression.cpp
@@ -0,0 +1,222 @@
+#include "compression.h"
+
+#include "envvalue.h"
+
+#include <stdexcept>
+#include <zlib.h>
+
+const std::string LZMA_INFO::name = "lzma";
+// Resets `stream` and sets it up as a lzma stream decoder. The memory limit
+// comes from zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", ...) with
+// LZMA_MEMORY_SIZE MiB as the second argument. `raw_data` is not used.
+// Throws std::runtime_error when lzma_stream_decoder does not return LZMA_OK.
+void LZMA_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+{
+  stream_t fresh = LZMA_STREAM_INIT;
+  *stream = fresh;
+
+  unsigned memLimit = zim::envMemSize("ZIM_LZMA_MEMORY_SIZE", LZMA_MEMORY_SIZE * 1024 * 1024);
+  if (lzma_stream_decoder(stream, memLimit, 0) == LZMA_OK)
+    return;
+
+  throw std::runtime_error("Impossible to allocated needed memory to uncompress lzma stream");
+}
+
+// Resets `stream` and sets it up as a lzma encoder with preset 9 combined
+// with LZMA_PRESET_EXTREME and a CRC32 check. `raw_data` is not used.
+// Throws std::runtime_error when lzma_easy_encoder does not return LZMA_OK.
+void LZMA_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+{
+  stream_t fresh = LZMA_STREAM_INIT;
+  *stream = fresh;
+
+  if (lzma_easy_encoder(stream, 9 | LZMA_PRESET_EXTREME, LZMA_CHECK_CRC32) == LZMA_OK)
+    return;
+
+  throw std::runtime_error("Cannot initialize lzma_easy_encoder");
+}
+
+// Encoding step; forwards to stream_run.
+CompStatus LZMA_INFO::stream_run_encode(stream_t* stream, CompStep step)
+{
+  return LZMA_INFO::stream_run(stream, step);
+}
+
+// Decoding step; forwards to stream_run.
+CompStatus LZMA_INFO::stream_run_decode(stream_t* stream, CompStep step)
+{
+  return LZMA_INFO::stream_run(stream, step);
+}
+
+// Runs lzma_code with LZMA_RUN for CompStep::STEP and LZMA_FINISH otherwise,
+// and translates the result: LZMA_BUF_ERROR, LZMA_STREAM_END and LZMA_OK map
+// to their CompStatus counterparts, anything else to CompStatus::OTHER.
+CompStatus LZMA_INFO::stream_run(stream_t* stream, CompStep step)
+{
+  const lzma_action action = (step == CompStep::STEP) ? LZMA_RUN : LZMA_FINISH;
+  switch (lzma_code(stream, action))
+  {
+    case LZMA_BUF_ERROR:
+      return CompStatus::BUF_ERROR;
+    case LZMA_STREAM_END:
+      return CompStatus::STREAM_END;
+    case LZMA_OK:
+      return CompStatus::OK;
+    default:
+      return CompStatus::OTHER;
+  }
+}
+
+// Ends the decoder stream by calling lzma_end on it.
+void LZMA_INFO::stream_end_decode(stream_t* stream)
+{
+  ::lzma_end(stream);
+}
+
+// Ends the encoder stream by calling lzma_end on it.
+void LZMA_INFO::stream_end_encode(stream_t* stream)
+{
+  ::lzma_end(stream);
+}
+
+
+#if defined(ENABLE_ZLIB)
+const std::string ZIP_INFO::name = "zlib";
+// Zeroes `stream`, points its input at `raw_data` with 1024 available bytes
+// and initializes it for inflating.
+// Throws std::runtime_error when inflateInit does not return Z_OK.
+void ZIP_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+{
+  memset(stream, 0, sizeof(stream_t));
+  stream->next_in = (unsigned char*) raw_data;
+  stream->avail_in = 1024;
+
+  if (::inflateInit(stream) == Z_OK)
+    return;
+
+  throw std::runtime_error("Impossible to allocated needed memory to uncompress zlib stream");
+}
+
+// Zeroes `stream` and initializes it for deflating with the default
+// compression level. `raw_data` is not used.
+// Throws std::runtime_error when deflateInit does not return Z_OK.
+void ZIP_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+{
+  memset(stream, 0, sizeof(z_stream));
+  auto errcode = ::deflateInit(stream, Z_DEFAULT_COMPRESSION);
+  if (errcode != Z_OK) {
+    // this is the compression side; the message used to say "uncompress"
+    throw std::runtime_error("Impossible to allocate needed memory to compress zlib stream");
+  }
+}
+
+// Runs inflate with Z_SYNC_FLUSH for CompStep::STEP and Z_FINISH otherwise,
+// and translates the result: Z_BUF_ERROR, Z_STREAM_END and Z_OK map to their
+// CompStatus counterparts, anything else to CompStatus::OTHER.
+CompStatus ZIP_INFO::stream_run_decode(stream_t* stream, CompStep step)
+{
+  const int flush = (step == CompStep::STEP) ? Z_SYNC_FLUSH : Z_FINISH;
+  switch (::inflate(stream, flush))
+  {
+    case Z_BUF_ERROR:
+      return CompStatus::BUF_ERROR;
+    case Z_STREAM_END:
+      return CompStatus::STREAM_END;
+    case Z_OK:
+      return CompStatus::OK;
+    default:
+      return CompStatus::OTHER;
+  }
+}
+
+// Runs deflate with Z_SYNC_FLUSH for CompStep::STEP and Z_FINISH otherwise,
+// and translates the result: Z_BUF_ERROR, Z_STREAM_END and Z_OK map to their
+// CompStatus counterparts, anything else to CompStatus::OTHER.
+CompStatus ZIP_INFO::stream_run_encode(stream_t* stream, CompStep step)
+{
+  const int flush = (step == CompStep::STEP) ? Z_SYNC_FLUSH : Z_FINISH;
+  switch (::deflate(stream, flush))
+  {
+    case Z_BUF_ERROR:
+      return CompStatus::BUF_ERROR;
+    case Z_STREAM_END:
+      return CompStatus::STREAM_END;
+    case Z_OK:
+      return CompStatus::OK;
+    default:
+      return CompStatus::OTHER;
+  }
+}
+
+// Ends the inflate stream and asserts that inflateEnd reported Z_OK.
+void ZIP_INFO::stream_end_decode(stream_t* stream)
+{
+  auto status = ::inflateEnd(stream);
+  ASSERT(status, ==, Z_OK);
+}
+
+// Ends the deflate stream and asserts that deflateEnd reported Z_OK.
+void ZIP_INFO::stream_end_encode(stream_t* stream)
+{
+  auto status = ::deflateEnd(stream);
+  ASSERT(status, ==, Z_OK);
+}
+#endif // ENABLE_ZLIB
+
+#if defined(ENABLE_ZSTD)
+const std::string ZSTD_INFO::name = "zstd";
+
+// Starts with no input, no output, a zero output total and no zstd
+// encoder/decoder stream.
+ZSTD_INFO::stream_t::stream_t()
+  : next_in(nullptr), avail_in(0),
+    next_out(nullptr), avail_out(0),
+    total_out(0),
+    encoder_stream(nullptr),
+    decoder_stream(nullptr)
+{
+}
+
+// Frees whichever zstd streams have been created for this object.
+ZSTD_INFO::stream_t::~stream_t()
+{
+  if ( encoder_stream != nullptr ) {
+    ::ZSTD_freeCStream(encoder_stream);
+  }
+
+  if ( decoder_stream != nullptr ) {
+    ::ZSTD_freeDStream(decoder_stream);
+  }
+}
+
+// Creates and initializes the zstd decompression stream of `stream`.
+// `raw_data` is not used.
+// Throws std::runtime_error when ZSTD_initDStream reports an error.
+void ZSTD_INFO::init_stream_decoder(stream_t* stream, char* raw_data)
+{
+  stream->decoder_stream = ::ZSTD_createDStream();
+
+  const auto rc = ::ZSTD_initDStream(stream->decoder_stream);
+  if (!::ZSTD_isError(rc))
+    return;
+
+  throw std::runtime_error("Failed to initialize Zstd decompression");
+}
+
+// Creates the zstd compression stream of `stream` and initializes it with
+// the level returned by ZSTD_maxCLevel(). `raw_data` is not used.
+// Throws std::runtime_error when ZSTD_initCStream reports an error.
+void ZSTD_INFO::init_stream_encoder(stream_t* stream, char* raw_data)
+{
+  stream->encoder_stream = ::ZSTD_createCStream();
+
+  const auto rc = ::ZSTD_initCStream(stream->encoder_stream, ::ZSTD_maxCLevel());
+  if (!::ZSTD_isError(rc))
+    return;
+
+  throw std::runtime_error("Failed to initialize Zstd compression");
+}
+
+CompStatus ZSTD_INFO::stream_run_encode(stream_t* stream, CompStep step) {
+  ::ZSTD_inBuffer inBuf;
+  inBuf.src = stream->next_in;
+  inBuf.size = stream->avail_in;
+  inBuf.pos = 0;
+
+  ::ZSTD_outBuffer outBuf;
+  outBuf.dst = stream->next_out;
+  outBuf.size = stream->avail_out;
+  outBuf.pos = 0;
+
+  auto ret = step == CompStep::STEP
+           ? ::ZSTD_compressStream(stream->encoder_stream, &outBuf, &inBuf)
+           : ::ZSTD_endStream(stream->encoder_stream, &outBuf);
+  stream->next_in += inBuf.pos;
+  stream->avail_in -= inBuf.pos;
+  stream->next_out += outBuf.pos;
+  stream->avail_out -= outBuf.pos;
+  stream->total_out += outBuf.pos;
+
+  if (::ZSTD_isError(ret))
+    return CompStatus::OTHER;
+
+  if ( step == CompStep::STEP ) {
+    if ( stream->avail_in != 0)
+      ASSERT(stream->avail_out, ==, 0u);
+      return CompStatus::BUF_ERROR;
+  } else if ( ret > 0 ) {
+      return CompStatus::BUF_ERROR;
+  }
+
+  return CompStatus::OK;
+}
+
+CompStatus ZSTD_INFO::stream_run_decode(stream_t* stream, CompStep /*step*/) {
+  ::ZSTD_inBuffer inBuf;
+  inBuf.src = stream->next_in;
+  inBuf.size = stream->avail_in;
+  inBuf.pos = 0;
+
+  ::ZSTD_outBuffer outBuf;
+  outBuf.dst = stream->next_out;
+  outBuf.size = stream->avail_out;
+  outBuf.pos = 0;
+
+  auto ret = ::ZSTD_decompressStream(stream->decoder_stream, &outBuf, &inBuf);
+  stream->next_in += inBuf.pos;
+  stream->avail_in -= inBuf.pos;
+  stream->next_out += outBuf.pos;
+  stream->avail_out -= outBuf.pos;
+  stream->total_out += outBuf.pos;
+
+  if (::ZSTD_isError(ret))
+    return CompStatus::OTHER;
+
+  if (ret == 0)
+    return CompStatus::STREAM_END;
+
+  return CompStatus::BUF_ERROR;
+}
+
+// Nothing to do here: the decoder stream is freed by ~stream_t().
+void ZSTD_INFO::stream_end_decode(stream_t* stream)
+{}
+
+// Nothing to do here: the encoder stream is freed by ~stream_t().
+void ZSTD_INFO::stream_end_encode(stream_t* stream)
+{}
+#endif
diff --git a/src/compression.h b/src/compression.h

new file mode 100644 (file)

index 0000000..b76b054
--- /dev/null
+++ b/src/compression.h
@@ -0,0 +1,277 @@
+#ifndef _LIBZIM_COMPRESSION_
+#define _LIBZIM_COMPRESSION_
+
+#include <vector>
+#include "string.h"
+
+#include "file_reader.h"
+#include <zim/error.h>
+
+#include "config.h"
+
+#include <lzma.h>
+#if defined(ENABLE_ZLIB)
+#include <zlib.h>
+#endif
+
+#if defined(ENABLE_ZSTD)
+#include <zstd.h>
+#endif
+
+
+#include "zim_types.h"
+
+// Debug trace hook. Swap the two definitions below to print X on std::cerr
+// prefixed with the calling function name; by default DEB expands to nothing.
+//#define DEB(X) std::cerr << __func__ << " " << X << std::endl ;
+#define DEB(X)
+
+// Mode requested for one call of a codec's stream_run_* function.
+enum class CompStep {
+  // Process the input currently attached to the stream.
+  STEP,
+  // No more input will come: terminate the stream.
+  FINISH
+};
+
+// Result of one call of a codec's stream_run_* function.
+enum class CompStatus {
+  // The call succeeded.
+  OK,
+  // The codec reached the end of the stream.
+  STREAM_END,
+  // The codec could not complete with the current input/output windows.
+  BUF_ERROR,
+  // Any other codec error.
+  OTHER
+};
+
+// Result of Compressor::feed / Uncompressor::feed.
+enum class RunnerStatus {
+  // The stream is complete.
+  OK,
+  // The supplied input was processed; the stream is not finished yet.
+  NEED_MORE,
+  // The codec reported an unrecoverable error.
+  ERROR
+};
+
+// Codec traits for LZMA: the static hooks used by the Compressor and
+// Uncompressor templates, operating on an lzma_stream.
+struct LZMA_INFO {
+  // Stream state type handed to every hook below.
+  typedef lzma_stream stream_t;
+  // Codec name; uncompress() inserts it in its error message.
+  static const std::string name;
+  // Prepare `stream` for decoding / encoding.
+  static void init_stream_decoder(stream_t* stream, char* raw_data);
+  static void init_stream_encoder(stream_t* stream, char* raw_data);
+  // Run one encode / decode step and report its status.
+  static CompStatus stream_run_encode(stream_t* stream, CompStep step);
+  static CompStatus stream_run_decode(stream_t* stream, CompStep step);
+  // NOTE(review): presumably the shared implementation behind the two
+  // functions above - confirm in compression.cpp.
+  static CompStatus stream_run(stream_t* stream, CompStep step);
+  // Teardown hooks called once the result has been collected.
+  static void stream_end_encode(stream_t* stream);
+  static void stream_end_decode(stream_t* stream);
+};
+
+
+#if defined(ENABLE_ZLIB)
+// Codec traits for zlib: the static hooks used by the Compressor and
+// Uncompressor templates, operating on a z_stream.
+struct ZIP_INFO {
+  // Stream state type handed to every hook below.
+  typedef z_stream stream_t;
+  // Codec name; uncompress() inserts it in its error message.
+  static const std::string name;
+  // Prepare `stream` for decoding / encoding.
+  static void init_stream_decoder(stream_t* stream, char* raw_data);
+  static void init_stream_encoder(stream_t* stream, char* raw_data);
+  // Run one encode / decode step and report its status.
+  static CompStatus stream_run_encode(stream_t* stream, CompStep step);
+  static CompStatus stream_run_decode(stream_t* stream, CompStep step);
+  // Teardown hooks called once the result has been collected.
+  static void stream_end_encode(stream_t* stream);
+  static void stream_end_decode(stream_t* stream);
+};
+#endif
+
+#if defined(ENABLE_ZSTD)
+// Codec traits for Zstandard: the static hooks used by the Compressor and
+// Uncompressor templates.
+struct ZSTD_INFO {
+  // Stream state exposing the same next_in/avail_in/next_out/avail_out/
+  // total_out fields the templates use on the other codecs' streams.
+  struct stream_t
+  {
+    // Cursor into the pending input and number of bytes left to read.
+    const unsigned char* next_in;
+    size_t avail_in;
+    // Cursor into the output window and number of bytes left to write.
+    unsigned char* next_out;
+    size_t avail_out;
+    // Running count of bytes produced.
+    size_t total_out;
+
+    // Zstd handles used by stream_run_encode / stream_run_decode.
+    ::ZSTD_CStream* encoder_stream;
+    ::ZSTD_DStream* decoder_stream;
+
+    stream_t();
+    ~stream_t();
+  private:
+    // Non-copyable.
+    stream_t(const stream_t& t) = delete;
+    void operator=(const stream_t& t) = delete;
+  };
+
+  // Codec name; uncompress() inserts it in its error message.
+  static const std::string name;
+  // Prepare `stream` for decoding / encoding.
+  static void init_stream_decoder(stream_t* stream, char* raw_data);
+  static void init_stream_encoder(stream_t* stream, char* raw_data);
+  // Run one encode / decode step and report its status.
+  static CompStatus stream_run_encode(stream_t* stream, CompStep step);
+  static CompStatus stream_run_decode(stream_t* stream, CompStep step);
+  // Teardown hooks called once the result has been collected.
+  static void stream_end_encode(stream_t* stream);
+  static void stream_end_decode(stream_t* stream);
+};
+
+#endif
+
+
+namespace zim {
+
+/**
+ * Incremental decompressor writing into a growable heap buffer.
+ *
+ * INFO is one of the codec traits structs (it must provide stream_t,
+ * init_stream_decoder, stream_run_decode and stream_end_decode).
+ * Usage: init(), then feed() for each input chunk until it returns
+ * RunnerStatus::OK, then get_data() to take ownership of the result.
+ */
+template<typename INFO>
+class Uncompressor
+{
+  public:
+    // Allocate the output buffer with `initial_size` bytes.
+    Uncompressor(size_t initial_size=1024*1024) :
+      ret_data(new char[initial_size]),
+      data_size(initial_size)
+    {}
+    ~Uncompressor() = default;
+
+    // Initialize the decoder and point its output window at the whole buffer.
+    void init(char* data) {
+      INFO::init_stream_decoder(&stream, data);
+      stream.next_out = (uint8_t*)ret_data.get();
+      stream.avail_out = data_size;
+    }
+
+    /**
+     * Decode `size` bytes starting at `data`.
+     *
+     * @return RunnerStatus::OK once the codec reports the end of the stream,
+     *         RunnerStatus::NEED_MORE when the input was consumed without
+     *         reaching it, RunnerStatus::ERROR on any other codec status.
+     */
+    RunnerStatus feed(char* data, size_t size, CompStep step = CompStep::STEP) {
+      stream.next_in = (unsigned char*)data;
+      stream.avail_in = size;
+      auto errcode = CompStatus::OTHER;
+      while (true) {
+        errcode = INFO::stream_run_decode(&stream, step);
+        DEB((int)errcode)
+        if (errcode == CompStatus::BUF_ERROR) {
+          if (stream.avail_in == 0 && stream.avail_out != 0)  {
+            // End of the supplied input.
+            // The decompressor has not seen the end of the stream yet, but
+            // there is no more input in this chunk.
+            return RunnerStatus::NEED_MORE;
+          } else {
+            // Not enough output space: double the buffer, keep what was
+            // already produced and retry.
+            DEB("need memory " << data_size << " " << stream.avail_out << " " << stream.total_out)
+            data_size *= 2;
+            std::unique_ptr<char[]> new_ret_data(new char[data_size]);
+            memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
+            stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
+            stream.avail_out = data_size - stream.total_out;
+            DEB(data_size << " " << stream.avail_out << " " << stream.avail_in)
+            ret_data = std::move(new_ret_data);
+            continue;
+          }
+        }
+        if (errcode == CompStatus::STREAM_END)
+          break;
+        // On the first call where lzma cannot make progress (no output space
+        // left), lzma returns OK. If we returned NEED_MORE we would be fed new
+        // input although the current input is not fully processed. We must
+        // run a second step to get the BUF_ERROR and handle it correctly.
+        if (errcode == CompStatus::OK) {
+          if (stream.avail_in == 0)
+            break;
+          continue;
+        }
+        return RunnerStatus::ERROR;
+      };
+      return errcode==CompStatus::STREAM_END?RunnerStatus::OK:RunnerStatus::NEED_MORE;
+    }
+
+    // Store the number of bytes produced in `size`, run the codec teardown
+    // hook and hand the buffer over to the caller (ret_data is left empty).
+    std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
+      size->v = stream.total_out;
+      INFO::stream_end_decode(&stream);
+      return std::move(ret_data);
+    }
+
+  private:
+    // Output buffer and its current capacity in bytes.
+    std::unique_ptr<char[]> ret_data;
+    size_type data_size;
+    // Codec-specific stream state.
+    typename INFO::stream_t stream;
+};
+
+#define CHUNCK_SIZE ((zim::size_type)(1024))
+/**
+ * Uncompress data of the reader at startOffset.
+ *
+ * @param reader         The reader where the data is.
+ * @param startOffset    The offset where the data is in the reader.
+ * @param dest_size[out] The size of the uncompressed data.
+ * @return A pointer to the uncompressed data. This must be deleted (delete[])
+*/
+template<typename INFO>
+std::unique_ptr<char[]> uncompress(const zim::Reader* reader, zim::offset_t startOffset, zim::zsize_t* dest_size) {
+  // Use a compressor to compress the data.
+  // As we don't know the result size, neither the compressed size,
+  // we have to do chunk by chunk until decompressor is happy.
+  // Let's assume it will be something like the minChunkSize used at creation
+  Uncompressor<INFO> runner(1024*1024);
+  // The input is a buffer of CHUNCK_SIZE char max. It may be less if the last chunk
+  // is at the end of the reader and the reader size is not a multiple of CHUNCK_SIZE.
+  std::vector<char> raw_data(CHUNCK_SIZE);
+
+  DEB("Init")
+  runner.init(raw_data.data());
+
+  zim::size_type availableSize = reader->size().v - startOffset.v;
+  auto ret = RunnerStatus::NEED_MORE;
+  while(ret != RunnerStatus::OK) {
+    if (ret == RunnerStatus::NEED_MORE and availableSize) {
+      zim::size_type inputSize = std::min(availableSize, CHUNCK_SIZE);
+      reader->read(raw_data.data(), startOffset, zim::zsize_t(inputSize));
+      startOffset.v += inputSize;
+      availableSize -= inputSize;
+      DEB("Step " << startOffset.v)
+      ret = runner.feed(raw_data.data(), inputSize);
+      DEB("Ret " << (int)ret)
+    }
+    if (ret == RunnerStatus::ERROR) {
+      throw zim::ZimFileFormatError(std::string("Invalid ") + INFO::name
+                               + std::string(" stream for cluster."));
+    }
+  }
+
+  DEB("Finish")
+  return runner.get_data(dest_size);
+}
+
+template<typename INFO>
+class Compressor
+{
+  public:
+    Compressor(size_t initial_size=1024*1024) :
+      ret_data(new char[initial_size]),
+      ret_size(initial_size)
+    {}
+
+    ~Compressor() = default;
+
+    void init(char* data) {
+      INFO::init_stream_encoder(&stream, data);
+      stream.next_out = (uint8_t*)ret_data.get();
+      stream.avail_out = ret_size;
+    }
+
+    RunnerStatus feed(const char* data, size_t size, CompStep step=CompStep::STEP) {
+      stream.next_in = (unsigned char*)data;
+      stream.avail_in = size;
+      auto errcode = CompStatus::OTHER;
+      while (1) {
+        errcode = INFO::stream_run_encode(&stream, step);
+        if (errcode == CompStatus::BUF_ERROR) {
+          if (stream.avail_out == 0 && stream.avail_in != 0) {
+            //Not enought output size
+            ret_size *= 2;
+            std::unique_ptr<char[]> new_ret_data(new char[ret_size]);
+            memcpy(new_ret_data.get(), ret_data.get(), stream.total_out);
+            stream.next_out = (unsigned char*)(new_ret_data.get() + stream.total_out);
+            stream.avail_out = ret_size - stream.total_out;
+            ret_data = std::move(new_ret_data);
+            continue;
+          }
+        }
+        if (errcode == CompStatus::STREAM_END || errcode == CompStatus::OK)
+          break;
+        return RunnerStatus::ERROR;
+      };
+      return RunnerStatus::NEED_MORE;
+    }
+
+    std::unique_ptr<char[]> get_data(zim::zsize_t* size) {
+      feed(nullptr, 0, CompStep::FINISH);
+      INFO::stream_end_encode(&stream);
+      size->v = stream.total_out;
+      return std::move(ret_data);
+    }
+
+  private:
+    std::unique_ptr<char[]> ret_data;
+    size_t ret_size;
+    typename INFO::stream_t stream;
+};
+
+} // namespace zim
+
+#endif // _LIBZIM_COMPRESSION_
diff --git a/src/config.h.in b/src/config.h.in

new file mode 100644 (file)

index 0000000..78ab74b
--- /dev/null
+++ b/src/config.h.in
@@ -0,0 +1,20 @@
+
+#mesondefine VERSION
+
+#mesondefine DIRENT_CACHE_SIZE
+
+#mesondefine CLUSTER_CACHE_SIZE
+
+#mesondefine LZMA_MEMORY_SIZE
+
+#mesondefine ENABLE_ZLIB
+
+#mesondefine ENABLE_ZSTD
+
+#mesondefine ENABLE_XAPIAN
+
+#mesondefine ENABLE_USE_MMAP
+
+#mesondefine ENABLE_USE_BUFFER_HEADER
+
+#mesondefine MMAP_SUPPORT_64
diff --git a/src/debug.h b/src/debug.h

new file mode 100644 (file)

index 0000000..2cfe8e1
--- /dev/null
+++ b/src/debug.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef DEBUG_H_
+#define DEBUG_H_
+
+#include <iostream>
+#include <stdlib.h>
+
+#if defined (NDEBUG)
+# define ASSERT(left, operator, right) (void(0))
+#else
+
+#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__)
+#include <execinfo.h>
+#endif
+
+template<typename T, typename U>
+void _on_assert_fail(const char* vara, const char* op, const char* varb,
+                     T a, U b, const char* file, int line)  {
+  std::cerr << "\nAssertion failed at "<< file << ":" << line << "\n " <<
+      vara << "[" << a << "] " << op << " " << varb << "[" << b << "]" <<
+      std::endl;
+
+#if !defined(_WIN32) && !defined(__APPLE__) && !defined(__ANDROID__)
+  void *callstack[64];
+  size_t size;
+  size = backtrace(callstack, 64);
+  char** strings = backtrace_symbols(callstack, size);
+  for (size_t i=0; i<size; i++) {
+    std::cerr << strings[i] << std::endl;
+  }
+  free(strings);
+#endif
+  exit(1);
+}
+
+// ASSERT(left, operator, right): evaluates each operand exactly once and,
+// when `left operator right` is false, calls _on_assert_fail with the
+// operand texts, their values and the source location (which prints a
+// report and exits). Usage: ASSERT(size, ==, 0u);
+# define ASSERT(left, operator, right) do { auto _left = left; auto _right = right; if (!((_left) operator (_right))) _on_assert_fail(#left, #operator, #right, _left, _right, __FILE__, __LINE__);  } while(0)
+
+#endif
+
+#endif
diff --git a/src/dirent.cpp b/src/dirent.cpp

new file mode 100644 (file)

index 0000000..ea07081
--- /dev/null
+++ b/src/dirent.cpp
@@ -0,0 +1,126 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "_dirent.h"
+#include <zim/zim.h>
+#include "buffer.h"
+#include "endian_tools.h"
+#include "log.h"
+#include <algorithm>
+#include <cstring>
+
+log_define("zim.dirent")
+
+namespace zim
+{
+  //////////////////////////////////////////////////////////////////////
+  // Dirent
+  //
+
+  // Out-of-class definitions of the special mime-type markers compared
+  // against in the Dirent constructor (values presumably given in
+  // _dirent.h - confirm there).
+  const uint16_t Dirent::redirectMimeType;
+  const uint16_t Dirent::linktargetMimeType;
+  const uint16_t Dirent::deletedMimeType;
+
+  Dirent::Dirent(std::unique_ptr<Buffer> buffer)
+    : Dirent()
+  {
+    uint16_t mimeType = buffer->as<uint16_t>(offset_t(0));
+    bool redirect = (mimeType == Dirent::redirectMimeType);
+    bool linktarget = (mimeType == Dirent::linktargetMimeType);
+    bool deleted = (mimeType == Dirent::deletedMimeType);
+    uint8_t extraLen = buffer->data()[2];
+    char ns = buffer->data()[3];
+    uint32_t version = buffer->as<uint32_t>(offset_t(4));
+    setVersion(version);
+
+    offset_t current = offset_t(8);
+
+    if (redirect)
+    {
+      article_index_t redirectIndex(buffer->as<article_index_type>(current));
+      current += sizeof(article_index_t);
+
+      log_debug("redirectIndex=" << redirectIndex);
+
+      setRedirect(article_index_t(redirectIndex));
+    }
+    else if (linktarget || deleted)
+    {
+      log_debug("linktarget or deleted entry");
+      setArticle(mimeType, cluster_index_t(0), blob_index_t(0));
+    }
+    else
+    {
+      log_debug("read article entry");
+
+      uint32_t clusterNumber = buffer->as<uint32_t>(current);
+      current += sizeof(uint32_t);
+      uint32_t blobNumber = buffer->as<uint32_t>(current);
+      current += sizeof(uint32_t);
+
+      log_debug("mimeType=" << mimeType << " clusterNumber=" << clusterNumber << " blobNumber=" << blobNumber);
+
+      setArticle(mimeType, cluster_index_t(clusterNumber), blob_index_t(blobNumber));
+    }
+
+    std::string url;
+    std::string title;
+    std::string parameter;
+
+    log_debug("read url, title and parameters");
+
+    offset_type url_size = strnlen(
+      buffer->data(current),
+      buffer->size().v - current.v - extraLen
+    );
+    if (current.v + url_size >= buffer->size().v) {
+      throw(InvalidSize());
+    }
+    url = std::string(buffer->data(current), url_size);
+    current += url_size + 1;
+
+    offset_type title_size = strnlen(
+      buffer->data(current),
+      buffer->size().v - current.v - extraLen
+    );
+    if (current.v + title_size >= buffer->size().v) {
+      throw(InvalidSize());
+    }
+    title = std::string(buffer->data(current), title_size);
+    current += title_size + 1;
+
+    if (current.v + extraLen > buffer->size().v) {
+       throw(InvalidSize());
+    }
+    parameter = std::string(buffer->data(current), extraLen);
+
+    setUrl(ns, url);
+    setTitle(title);
+    setParameter(parameter);
+  }
+
+  // Build the long url "<namespace>/<url>" of this dirent.
+  std::string Dirent::getLongUrl() const
+  {
+    log_trace("Dirent::getLongUrl()");
+    log_debug("namespace=" << getNamespace() << " title=" << getTitle());
+
+    std::string longUrl(1, getNamespace());
+    longUrl += '/';
+    longUrl += getUrl();
+    return longUrl;
+  }
+
+}
diff --git a/src/endian_tools.h b/src/endian_tools.h

new file mode 100644 (file)

index 0000000..9bf6bf7
--- /dev/null
+++ b/src/endian_tools.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ENDIAN_H
+#define ENDIAN_H
+
+#include <algorithm>
+#include <iostream>
+#include <zim/zim.h>
+
+namespace zim
+{
+
+template<typename T, size_t N>
+struct ToLittleEndianImpl;
+
+template<typename T>
+struct ToLittleEndianImpl<T, 2>{
+  static void write(const T& d, char* dst) {
+    uint16_t v = static_cast<uint16_t>(d);
+    dst[0] = static_cast<uint8_t>(v);
+    dst[1] = static_cast<uint8_t>(v>>8);
+  }
+};
+
+template<typename T>
+struct ToLittleEndianImpl<T, 4>{
+  static void write(const T& d, char* dst) {
+    uint32_t v = static_cast<uint32_t>(d);
+    dst[0] = static_cast<uint8_t>(v);
+    dst[1] = static_cast<uint8_t>(v>>8);
+    dst[2] = static_cast<uint8_t>(v>>16);
+    dst[3] = static_cast<uint8_t>(v>>24);
+}
+};
+
+template<typename T>
+struct ToLittleEndianImpl<T, 8>{
+  static void write(const T& d, char* dst) {
+    uint64_t v = static_cast<uint64_t>(d);
+    dst[0] = static_cast<uint8_t>(v);
+    dst[1] = static_cast<uint8_t>(v>>8);
+    dst[2] = static_cast<uint8_t>(v>>16);
+    dst[3] = static_cast<uint8_t>(v>>24);
+    dst[4] = static_cast<uint8_t>(v>>32);
+    dst[5] = static_cast<uint8_t>(v>>40);
+    dst[6] = static_cast<uint8_t>(v>>48);
+    dst[7] = static_cast<uint8_t>(v>>56);
+  }
+};
+
+////////////////////////////////////////////////////////////////////////
+template <typename T>
+inline void toLittleEndian(T d, char* dst)
+{
+  ToLittleEndianImpl<T, sizeof(T)>::write(d, dst);
+}
+
+// Read sizeof(T) bytes at `ptr`, least significant byte first, into a T.
+template <typename T>
+inline T fromLittleEndian(const char* ptr)
+{
+  T value = 0;
+  size_t shift = 0;
+  for (size_t idx = 0; idx < sizeof(T); ++idx, shift += 8) {
+    // Go through uint8_t so that bytes >= 0x80 are not sign-extended.
+    const uint8_t byte = static_cast<uint8_t>(ptr[idx]);
+    value |= (static_cast<T>(byte) << shift);
+  }
+  return value;
+}
+
+}
+
+#endif // ENDIAN_H
+
diff --git a/src/envvalue.cpp b/src/envvalue.cpp

new file mode 100644 (file)

index 0000000..1d5c64f
--- /dev/null
+++ b/src/envvalue.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <sstream>
+#include <stdlib.h>
+
+namespace zim
+{
+  unsigned envValue(const char* env, unsigned def)
+  {
+    const char* v = ::getenv(env);
+    if (v)
+    {
+      std::istringstream s(v);
+      s >> def;
+    }
+    return def;
+  }
+
+  unsigned envMemSize(const char* env, unsigned def)
+  {
+    const char* v = ::getenv(env);
+    if (v)
+    {
+      char unit = '\0';
+      std::istringstream s(v);
+      s >> def >> unit;
+
+      switch (unit)
+      {
+        case 'k':
+        case 'K': def *= 1024; break;
+        case 'm':
+        case 'M': def *= 1024 * 1024; break;
+        case 'g':
+        case 'G': def *= 1024 * 1024 * 1024; break;
+      }
+    }
+    return def;
+  }
+}
+
diff --git a/src/envvalue.h b/src/envvalue.h

new file mode 100644 (file)

index 0000000..d6dffd4
--- /dev/null
+++ b/src/envvalue.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_ENVVALUE_H
+#define ZIM_ENVVALUE_H
+
+namespace zim
+{
+  // Returns the unsigned integer read from environment variable `env`,
+  // or `def` when the variable is not set.
+  unsigned envValue(const char* env, unsigned def);
+  // Same as envValue, but the number may be followed by a unit:
+  // k/K, m/M or g/G multiply it by 1024, 1024^2 or 1024^3.
+  unsigned envMemSize(const char* env, unsigned def);
+}
+
+#endif // ZIM_ENVVALUE_H
diff --git a/src/file.cpp b/src/file.cpp

new file mode 100644 (file)

index 0000000..db9b3d3
--- /dev/null
+++ b/src/file.cpp
@@ -0,0 +1,312 @@
+/*
+ * Copyright (C) 2006,2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/file.h>
+#include "fileimpl.h"
+#include <zim/article.h>
+#include <zim/search.h>
+#include "log.h"
+#include <zim/fileiterator.h>
+#include <zim/error.h>
+
+log_define("zim.file")
+
+namespace zim
+{
+  namespace
+  {
+    int hexval(char ch)
+    {
+      if (ch >= '0' && ch <= '9')
+        return ch - '0';
+      if (ch >= 'a' && ch <= 'f')
+        return ch - 'a' + 10;
+      if (ch >= 'A' && ch <= 'F')
+        return ch - 'A' + 10;
+      return -1;
+    }
+  }
+
+  // Open the ZIM file `fname` by creating the underlying FileImpl.
+  File::File(const std::string& fname)
+    : impl(new FileImpl(fname))
+    { }
+
+  // File name, as reported by the implementation.
+  const std::string& File::getFilename() const
+  {
+    return impl->getFilename();
+  }
+
+  // Parsed file header, as held by the implementation.
+  const Fileheader& File::getFileheader() const
+  {
+    return impl->getFileheader();
+  }
+
+  // File size, unwrapped from the implementation's typed size.
+  size_type File::getFilesize() const
+  {
+    return impl->getFilesize().v;
+  }
+
+  // Number of articles, converted to the public index type.
+  article_index_type File::getCountArticles() const
+  {
+    return article_index_type(impl->getCountArticles());
+  }
+
+  Article File::getArticle(article_index_type idx) const
+  {
+    if (idx >= article_index_type(impl->getCountArticles()))
+      throw ZimFileFormatError("article index out of range");
+    return Article(impl, idx);
+  }
+
+  Article File::getArticle(char ns, const std::string& url) const
+  {
+    log_trace("File::getArticle('" << ns << "', \"" << url << ')');
+    std::pair<bool, article_index_t> r = impl->findx(ns, url);
+    return r.first ? Article(impl, article_index_type(r.second)) : Article();
+  }
+
+  Article File::getArticleByUrl(const std::string& url) const
+  {
+    log_trace("File::getArticle(\"" << url << ')');
+    std::pair<bool, article_index_t> r = impl->findx(url);
+    return r.first ? Article(impl, article_index_type(r.second)) : Article();
+  }
+
+  Article File::getArticleByTitle(article_index_type idx) const
+  {
+    return Article(impl, article_index_type(impl->getIndexByTitle(article_index_t(idx))));
+  }
+
+  Article File::getArticleByTitle(char ns, const std::string& title) const
+  {
+    log_trace("File::getArticleByTitle('" << ns << "', \"" << title << ')');
+    std::pair<bool, article_index_t> r = impl->findxByTitle(ns, title);
+    return r.first
+            ? Article(impl, article_index_type(impl->getIndexByTitle(r.second)))
+            : Article();
+  }
+
+  Article File::getArticleByClusterOrder(article_index_type idx) const
+  {
+      auto res = impl->findxByClusterOrder(idx);
+
+      if (res.first)
+        return Article(impl, res.second.v);
+      else
+        return Article();
+  }
+
+  // Cluster number `idx`, as returned by the implementation.
+  std::shared_ptr<const Cluster> File::getCluster(cluster_index_type idx) const
+  {
+    return impl->getCluster(cluster_index_t(idx));
+  }
+
+  // Number of clusters, converted to the public index type.
+  cluster_index_type File::getCountClusters() const
+  {
+    return cluster_index_type(impl->getCountClusters());
+  }
+
+  // Offset of cluster `idx`, converted to the public offset type.
+  offset_type File::getClusterOffset(cluster_index_type idx) const
+  {
+    return offset_type(impl->getClusterOffset(cluster_index_t(idx)));
+  }
+
+  // Blob `blobIdx` of cluster `clusterIdx`.
+  Blob File::getBlob(cluster_index_type clusterIdx, blob_index_type blobIdx) const
+  {
+    return impl->getCluster(cluster_index_t(clusterIdx))->getBlob(blob_index_t(blobIdx));
+  }
+
+  // Begin offset of namespace `ch`, as reported by the implementation.
+  article_index_type File::getNamespaceBeginOffset(char ch) const
+  {
+    return article_index_type(impl->getNamespaceBeginOffset(ch));
+  }
+
+  // End offset of namespace `ch`, as reported by the implementation.
+  article_index_type File::getNamespaceEndOffset(char ch) const
+  {
+    return article_index_type(impl->getNamespaceEndOffset(ch));
+  }
+
+  // Number of entries in namespace `ns`: end offset minus begin offset.
+  article_index_type File::getNamespaceCount(char ns) const
+  {
+    return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns);
+  }
+
+  // Namespaces string, as reported by the implementation.
+  std::string File::getNamespaces() const
+  {
+    return impl->getNamespaces();
+  }
+
+  // True when the namespace's begin offset is a valid article index and the
+  // dirent found there really belongs to namespace `ch`.
+  bool File::hasNamespace(char ch) const
+  {
+    article_index_t off = impl->getNamespaceBeginOffset(ch);
+    return off < impl->getCountArticles() && impl->getDirent(off)->getNamespace() == ch;
+  }
+
+  // Iterator at index 0 in ClusterIterator mode.
+  File::const_iterator File::begin() const
+  { return const_iterator(this, 0, const_iterator::ClusterIterator); }
+
+  // Iterator at index 0 in ArticleIterator mode.
+  File::const_iterator File::beginByTitle() const
+  { return const_iterator(this, 0, const_iterator::ArticleIterator); }
+
+  // Iterator at index 0 in UrlIterator mode.
+  File::const_iterator File::beginByUrl() const
+  { return const_iterator(this, 0, const_iterator::UrlIterator); }
+
+  // Past-the-end iterator: index getCountArticles(), UrlIterator mode.
+  File::const_iterator File::end() const
+  { return const_iterator(this, getCountArticles(), const_iterator::UrlIterator); }
+
+  // Url-mode iterator at the index returned by impl->findx(ns, url).
+  // The "found" flag of the lookup is not checked here.
+  File::const_iterator File::find(char ns, const std::string& url) const
+  {
+    std::pair<bool, article_index_t> r = impl->findx(ns, url);
+    return File::const_iterator(this, article_index_type(r.second), const_iterator::UrlIterator);
+  }
+
+  // Url-mode iterator at the index returned by impl->findx(url).
+  // The "found" flag of the lookup is not checked here.
+  File::const_iterator File::find(const std::string& url) const
+  {
+    std::pair<bool, article_index_t> r = impl->findx(url);
+    return File::const_iterator(this, article_index_type(r.second), const_iterator::UrlIterator);
+  }
+
+  // Article-mode iterator at the index returned by
+  // impl->findxByTitle(ns, title). The "found" flag is not checked here.
+  File::const_iterator File::findByTitle(char ns, const std::string& title) const
+  {
+    std::pair<bool, article_index_t> r = impl->findxByTitle(ns, title);
+    return File::const_iterator(this, article_index_type(r.second), const_iterator::ArticleIterator);
+  }
+
+  std::unique_ptr<Search> File::search(const std::string& query, int start, int end) const {
+      auto search = std::unique_ptr<Search>(new Search(this));
+      search->set_query(query);
+      search->set_range(start, end);
+      return search;
+  }
+
+  std::unique_ptr<Search> File::suggestions(const std::string& query, int start, int end) const {
+      auto search = std::unique_ptr<Search>(new Search(this));
+      search->set_query(query);
+      search->set_range(start, end);
+      search->set_suggestion_mode(true);
+      return search;
+  }
+
+  // Offset of blob `blobIdx` of cluster `clusterIdx`, as reported by the
+  // implementation.
+  offset_type File::getOffset(cluster_index_type clusterIdx, blob_index_type blobIdx) const
+  {
+    return offset_type(impl->getBlobOffset(
+                           cluster_index_t(clusterIdx),
+                           blob_index_t(blobIdx)));
+  }
+
+  // Modification time, as reported by the implementation.
+  time_t File::getMTime() const
+  {
+    return impl->getMTime();
+  }
+
+  // Mime type string registered under index `idx`.
+  const std::string& File::getMimeType(uint16_t idx) const
+  {
+    return impl->getMimeType(idx);
+  }
+
+  // Checksum, as reported by the implementation.
+  std::string File::getChecksum()
+  {
+    return impl->getChecksum();
+  }
+
+  // Result of the implementation's verify().
+  bool File::verify()
+  {
+    return impl->verify();
+  }
+
+  // Result of the implementation's is_multiPart().
+  bool File::is_multiPart() const
+  {
+    return impl->is_multiPart();
+  }
+
+
+  std::string urldecode(const std::string& url)
+  {
+    std::string ret;
+    enum {
+      state_0,
+      state_h1,
+      state_h2
+    } state = state_0;
+
+    char ch = '\0';
+    for (std::string::const_iterator it = url.begin(); it != url.end(); ++it)
+    {
+      switch (state)
+      {
+        case state_0:
+          if (*it == '+')
+            ret += ' ';
+          else if (*it == '%')
+            state = state_h1;
+          else
+            ret += *it;
+          break;
+
+        case state_h1:
+          if ( (*it >= '0' && *it <= '9')
+            || (*it >= 'A' && *it <= 'F')
+            || (*it >= 'a' && *it <= 'f'))
+          {
+            ch = *it;
+            state = state_h2;
+          }
+          else
+          {
+            ret += '%';
+            ret += *it;
+            state = state_0;
+          }
+          break;
+
+        case state_h2:
+          if ( (*it >= '0' && *it <= '9')
+            || (*it >= 'A' && *it <= 'F')
+            || (*it >= 'a' && *it <= 'f'))
+          {
+            ret += static_cast<char>(hexval(ch) * 16 + hexval(*it));
+          }
+          else
+          {
+            ret += static_cast<char>(hexval(ch));
+            ret += *it;
+          }
+          state = state_0;
+          break;
+      }
+    }
+
+    switch (state)
+    {
+      case state_0:
+        break;
+
+      case state_h1:
+        ret += '%';
+        break;
+
+      case state_h2:
+        ret += '%';
+        ret += ch;
+        break;
+    }
+
+    return ret;
+  }
+}
diff --git a/src/file_compound.cpp b/src/file_compound.cpp

new file mode 100644 (file)

index 0000000..6d52639
--- /dev/null
+++ b/src/file_compound.cpp
@@ -0,0 +1,111 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "file_compound.h"
+#include "buffer.h"
+
+#include <errno.h>
+#include <string.h>
+#include <sstream>
+#include <sys/stat.h>
+
+#ifdef _WIN32
+#  include <io.h>
+#else
+#  include <unistd.h>
+#endif
+
+namespace zim {
+
+// Opens the archive called `filename`.
+//
+// First tries to open `filename` itself as a single part.  If that throws,
+// falls back to a split archive: parts named `filename` + "aa", "ab", ...
+// are appended in that order, each one mapped to the range of offsets it
+// covers.  For a given first letter the scan stops at the first part that
+// cannot be opened, then moves on to the next first letter.
+//
+// Throws std::runtime_error when no part at all could be opened.
+FileCompound::FileCompound(const std::string& filename):
+  _fsize(0),
+  mtime(0)  // getMTime() reads this as "not cached yet"; never leave it indeterminate
+{
+  try {
+    auto part = new FilePart<>(filename);
+    emplace(Range(offset_t(0), offset_t(part->size().v)), part);
+    _fsize = part->size();
+  } catch(...) {
+    // Remember errno as it is right after the failed single-file attempt, for
+    // the final error message.
+    int errnoSave = errno;
+    _fsize = zsize_t(0);
+    for (char ch0 = 'a'; ch0 <= 'z'; ++ch0)
+    {
+      std::string fname0 = filename + ch0;
+      for (char ch1 = 'a'; ch1 <= 'z'; ++ch1)
+      {
+        std::string fname1 = fname0 + ch1;
+
+        try {
+          auto currentPart = new FilePart<>(fname1);
+          emplace(Range(offset_t(_fsize.v), offset_t((_fsize+currentPart->size()).v)), currentPart);
+          _fsize += currentPart->size();
+        } catch (...) {
+          // No such part: stop scanning this first letter.
+          break;
+        }
+      }
+    }
+
+    if (empty())
+    {
+      std::ostringstream msg;
+      // Close the quote around the file name (it was left unbalanced).
+      msg << "error " << errnoSave << " opening file \"" << filename << "\"";
+      throw std::runtime_error(msg.str());
+    }
+  }
+}
+
+// Builds a single-part compound around an already opened part.
+//
+// The part is stored under the range [0, filePart->size()) and is deleted by
+// the destructor along with every other stored part.
+FileCompound::FileCompound(FilePart<>* filePart):
+  _fsize(0),
+  mtime(0)  // getMTime() reads this as "not cached yet"; never leave it indeterminate
+{
+  emplace(Range(offset_t(0), offset_t(filePart->size().v)), filePart);
+  _fsize = filePart->size();
+}
+
+// Deletes every FilePart stored in the map.
+FileCompound::~FileCompound() {
+  for (auto& entry : *this) {
+    delete entry.second;
+  }
+}
+
+// Returns the modification time of the first part's file, caching it in
+// `mtime` after the first successful lookup.
+//
+// The cached value is returned right away when it is non-zero or when the
+// compound holds no part.  Throws std::runtime_error if the stat call fails.
+time_t FileCompound::getMTime() const {
+  if (mtime || empty())
+    return mtime;
+
+  // filename() returns a reference, so the pointer stays valid while the
+  // part exists.
+  const char* fname = begin()->second->filename().c_str();
+
+  // stat64 is used when HAVE_STAT64 is defined, except on Apple platforms.
+  #if defined(HAVE_STAT64) && ! defined(__APPLE__)
+    struct stat64 st;
+    int ret = ::stat64(fname, &st);
+  #else
+    struct stat st;
+    int ret = ::stat(fname, &st);
+  #endif
+  if (ret != 0)
+  {
+    std::ostringstream msg;
+    msg << "stat failed with errno " << errno << " : " << strerror(errno);
+    throw std::runtime_error(msg.str());
+  }
+  mtime = st.st_mtime;
+
+  return mtime;
+
+}
+
+} // zim
diff --git a/src/file_compound.h b/src/file_compound.h

new file mode 100644 (file)

index 0000000..a6b7490
--- /dev/null
+++ b/src/file_compound.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_COMPOUND_H_
+#define ZIM_FILE_COMPOUND_H_
+
+#include "file_part.h"
+#include "zim_types.h"
+#include <map>
+#include <memory>
+#include <cstdio>
+
+namespace zim {
+
+class FileReader;
+
+// A span of offsets delimited by `min` and `max`, used as the key type of
+// FileCompound.  The single-argument form builds a point (min == max) and is
+// intentionally implicit so a bare offset_t can be used where a Range key is
+// expected.
+struct Range {
+  Range(const offset_t point) : min(point), max(point) {}
+  Range(const offset_t lower, const offset_t upper) : min(lower), max(upper) {}
+
+  const offset_t min;
+  const offset_t max;
+};
+
+// Ordering used as the key comparator of FileCompound.
+//
+// `lhs` sorts before `rhs` only when it starts before rhs.min and also ends
+// at or before rhs.min.  Two ranges for which neither sorts before the other
+// are treated as equivalent keys by the map, which is what lets a lookup with
+// a point or a span find the stored range(s) it falls into.
+//
+// This is a plain functor: it no longer derives from std::binary_function,
+// which is deprecated since C++11 and removed in C++17, and which std::map
+// does not require.
+struct less_range
+{
+  bool operator()(const Range& lhs, const Range& rhs) const {
+    return lhs.min < rhs.min && lhs.max <= rhs.min;
+  }
+};
+
+// An archive seen as an ordered collection of file parts.
+//
+// Each map entry associates the range of archive offsets covered by a part
+// with the FilePart storing it.  The stored parts are deleted by the
+// destructor.
+class FileCompound : public std::map<Range, FilePart<>*, less_range> {
+  public:
+    // Opens `filename`, or its split parts if the file itself cannot be opened.
+    FileCompound(const std::string& filename);
+    // Wraps a single already opened part.
+    FileCompound(FilePart<>* fpart);
+    ~FileCompound();
+
+    // Sum of the sizes of all parts.
+    zsize_t fsize() const { return _fsize; }
+    time_t getMTime() const;
+    // True when no part is stored.
+    bool fail() const { return empty(); }
+    // True when the archive is made of more than one part.
+    bool is_multiPart() const { return size() > 1; }
+
+    // Returns the iterator pair delimiting the parts that compare equivalent
+    // to the span [offset, offset+size) under less_range.
+    std::pair<FileCompound::const_iterator, FileCompound::const_iterator>
+    locate(offset_t offset, zsize_t size) const {
+        return equal_range(Range(offset, offset+size));
+    }
+
+  private:
+    zsize_t _fsize;
+    // Cache for getMTime(); 0 means "not looked up yet".  It is initialized
+    // here because getMTime() reads it before ever writing it.
+    mutable time_t mtime = 0;
+};
+
+
+};
+
+
+#endif //ZIM_FILE_COMPOUND_H_
diff --git a/src/file_part.h b/src/file_part.h

new file mode 100644 (file)

index 0000000..3867d29
--- /dev/null
+++ b/src/file_part.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_PART_H_
+#define ZIM_FILE_PART_H_
+
+#include <string>
+#include <cstdio>
+
+#include <zim/zim.h>
+
+#include "zim_types.h"
+#include "fs.h"
+
+namespace zim {
+
+// One physical file of an archive: its name, its open handle (of the type
+// provided by the FS policy) and its size as reported by that handle.
+template<typename FS=DEFAULTFS>
+class FilePart {
+  public:
+    // Opens `filename` through FS::openFile.
+    FilePart(const std::string& filename)
+      : m_filename(filename),
+        m_fhandle(FS::openFile(filename)),
+        m_size(m_fhandle.getSize())
+    {}
+
+    // Wraps the descriptor `fd`; the part then has an empty file name.
+    FilePart(int fd)
+      : m_filename(""),
+        m_fhandle(fd),
+        m_size(m_fhandle.getSize())
+    {}
+
+    ~FilePart() = default;
+
+    const std::string& filename() const { return m_filename; }
+    const typename FS::FD& fhandle() const { return m_fhandle; }
+
+    zsize_t size() const { return m_size; }
+    // fail()/good() only reflect whether the recorded size is zero or not.
+    bool fail() const { return !m_size; }
+    bool good() const { return bool(m_size); }
+
+  private:
+    const std::string m_filename;
+    typename FS::FD m_fhandle;
+    zsize_t m_size;
+};
+
+};
+
+#endif //ZIM_FILE_PART_H_
diff --git a/src/file_reader.cpp b/src/file_reader.cpp

new file mode 100644 (file)

index 0000000..64eb5d5
--- /dev/null
+++ b/src/file_reader.cpp
@@ -0,0 +1,273 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/error.h>
+#include "file_reader.h"
+#include "file_compound.h"
+#include "cluster.h"
+#include "buffer.h"
+#include "compression.h"
+#include <errno.h>
+#include <string.h>
+#include <cstring>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sstream>
+#include <system_error>
+#include <algorithm>
+
+
+#if defined(_MSC_VER)
+# include <io.h>
+# include <BaseTsd.h>
+  typedef SSIZE_T ssize_t;
+#endif
+
+namespace zim {
+
+// Reader over the whole of `source`.
+FileReader::FileReader(std::shared_ptr<const FileCompound> source)
+  : FileReader(source, offset_t(0), source->fsize()) {}
+
+// Reader over `source` from `offset` up to its end.
+FileReader::FileReader(std::shared_ptr<const FileCompound> source, offset_t offset)
+  : FileReader(source, offset, zsize_t(source->fsize().v-offset.v)) {}
+
+// Reader over the `size` bytes of `source` starting at `offset`.
+// Both the start and the end of the window are asserted to lie within the
+// source.
+FileReader::FileReader(std::shared_ptr<const FileCompound> source, offset_t offset, zsize_t size)
+  : source(source),
+    _offset(offset),
+    _size(size)
+{
+  ASSERT(offset.v, <=, source->fsize().v);
+  ASSERT(offset.v+size.v, <=, source->fsize().v);
+}
+
+// Reads the single byte located at `offset`, relative to this reader.
+//
+// Throws std::system_error (built from errno) with a detailed message when
+// the underlying handle reports a std::runtime_error.
+char FileReader::read(offset_t offset) const {
+  ASSERT(offset.v, <, _size.v);
+  // Translate to an offset in the whole compound.
+  offset += _offset;
+  // offset_t converts implicitly to a point Range for the lookup.
+  auto part_pair = source->lower_bound(offset);
+  auto& fhandle = part_pair->second->fhandle();
+  // Offset inside the part that was found.
+  offset_t local_offset = offset - part_pair->first.min;
+  ASSERT(local_offset, <=, part_pair->first.max);
+  char ret;
+  try {
+    fhandle.readAt(&ret, zsize_t(1), local_offset);
+  } catch (std::runtime_error& e) {
+    //Error while reading.
+    std::ostringstream s;
+    s << "Cannot read a char.\n";
+    s << " - File part is " <<  part_pair->second->filename() << "\n";
+    s << " - File part size is " << part_pair->second->size().v << "\n";
+    s << " - File part range is " << part_pair->first.min << "-" << part_pair->first.max << "\n";
+    s << " - Reading offset at " << offset.v << "\n";
+    s << " - local offset is " << local_offset.v << "\n";
+    s << " - error is " << strerror(errno) << "\n";
+    std::error_code ec(errno, std::generic_category());
+    throw std::system_error(ec, s.str());
+  };
+  return ret;
+}
+
+
+// Copies `size` bytes starting at `offset` (relative to this reader) into
+// `dest`, walking over every part the requested span touches.
+//
+// Throws std::system_error (built from errno) with a detailed message when
+// a part's handle reports a std::runtime_error.
+//
+// NOTE(review): the first assertion is evaluated before the zero-size early
+// return, so a zero-byte read at offset == _size does not satisfy it -
+// confirm whether that is intended.
+void FileReader::read(char* dest, offset_t offset, zsize_t size) const {
+  ASSERT(offset.v, <, _size.v);
+  ASSERT(offset.v+size.v, <=, _size.v);
+  if (! size ) {
+    return;
+  }
+  // Translate to an offset in the whole compound.
+  offset += _offset;
+  auto found_range = source->locate(offset, size);
+  for(auto current = found_range.first; current!=found_range.second; current++){
+    auto part = current->second;
+    Range partRange = current->first;
+    offset_t local_offset = offset-partRange.min;
+    ASSERT(size.v, >, 0U);
+    // Take what is still wanted, limited to what remains in this part.
+    zsize_t size_to_get = zsize_t(std::min(size.v, part->size().v-local_offset.v));
+    try {
+      part->fhandle().readAt(dest, size_to_get, local_offset);
+    } catch (std::runtime_error& e) {
+      std::ostringstream s;
+      s << "Cannot read chars.\n";
+      s << " - File part is " <<  part->filename() << "\n";
+      s << " - File part size is " << part->size().v << "\n";
+      s << " - File part range is " << partRange.min << "-" << partRange.max << "\n";
+      s << " - size_to_get is " << size_to_get.v << "\n";
+      s << " - total size is " << size.v << "\n";
+      s << " - Reading offset at " << offset.v << "\n";
+      s << " - local offset is " << local_offset.v << "\n";
+      s << " - error is " << strerror(errno) << "\n";
+      std::error_code ec(errno, std::generic_category());
+      throw std::system_error(ec, s.str());
+    };
+    ASSERT(size_to_get, <=, size);
+    // Advance the destination and the remaining request past this chunk.
+    dest += size_to_get.v;
+    size -= size_to_get;
+    offset += size_to_get;
+  }
+  // Everything requested must have been served by the visited parts.
+  ASSERT(size.v, ==, 0U);
+}
+
+
+// Returns a buffer holding the `size` bytes at `offset` (relative to this
+// reader).
+//
+// When ENABLE_USE_MMAP is defined and the span lies within a single part, the
+// buffer is an MMapBuffer created on that part's native handle.  Otherwise,
+// or when an MMapException is thrown, the bytes are copied into a newly
+// allocated array wrapped in a MemoryBuffer<true>.
+std::shared_ptr<const Buffer> FileReader::get_buffer(offset_t offset, zsize_t size) const {
+  ASSERT(size, <=, _size);
+#ifdef ENABLE_USE_MMAP
+  try {
+    auto found_range = source->locate(_offset+offset, size);
+    auto first_part_containing_it = found_range.first;
+    // More than one part in the located range: mapping is not attempted.
+    if (++first_part_containing_it != found_range.second) {
+      throw MMapException();
+    }
+
+    // The range is in only one part
+    auto range = found_range.first->first;
+    auto part = found_range.first->second;
+    auto local_offset = offset + _offset - range.min;
+    ASSERT(size, <=, part->size());
+    int fd = part->fhandle().getNativeHandle();
+    auto buffer = std::shared_ptr<const Buffer>(new MMapBuffer(fd, local_offset, size));
+    return buffer;
+  } catch(MMapException& e)
+#endif
+  {
+    // The range is several part, or we are on Windows.
+    // We will have to do some memory copies :/
+    // [TODO] Use Windows equivalent for mmap.
+    char* p = new char[size.v];
+    // presumably MemoryBuffer<true> takes ownership of p - TODO confirm in buffer.h
+    auto ret_buffer = std::shared_ptr<const Buffer>(new MemoryBuffer<true>(p, size));
+    read(p, offset, size);
+    return ret_buffer;
+  }
+}
+
+// Tells whether the `size` bytes starting at `offset` lie entirely inside
+// this reader.
+//
+// The end of the span is checked with a subtraction rather than by computing
+// offset + size: that sum can wrap around for huge values and would then
+// wrongly pass the comparison.  The subtraction is safe because it is only
+// evaluated once offset is known not to exceed the reader's size.
+bool Reader::can_read(offset_t offset, zsize_t size)
+{
+    const auto total = this->size().v;
+    return offset.v <= total && size.v <= total - offset.v;
+}
+
+
+// Uncompresses the data found at `offset` with the algorithm `comp` and
+// returns it wrapped in a MemoryBuffer<true>.
+//
+// LZMA is always handled; zlib and zstd are handled only when ENABLE_ZLIB /
+// ENABLE_ZSTD are defined, and throw std::runtime_error otherwise.  Any other
+// value of `comp` throws std::logic_error.
+std::shared_ptr<const Buffer> Reader::get_clusterBuffer(offset_t offset, CompressionType comp) const
+{
+  // Filled in by uncompress() through the pointer passed below.
+  zsize_t uncompressed_size(0);
+  std::unique_ptr<char[]> uncompressed_data;
+  switch (comp) {
+    case zimcompLzma:
+      uncompressed_data = uncompress<LZMA_INFO>(this, offset, &uncompressed_size);
+      break;
+    case zimcompZip:
+#if defined(ENABLE_ZLIB)
+      uncompressed_data = uncompress<ZIP_INFO>(this, offset, &uncompressed_size);
+#else
+      throw std::runtime_error("zlib not enabled in this library");
+#endif
+      break;
+    case zimcompZstd:
+#if defined(ENABLE_ZSTD)
+      uncompressed_data = uncompress<ZSTD_INFO>(this, offset, &uncompressed_size);
+#else
+      throw std::runtime_error("zstd not enabled in this library");
+#endif
+      break;
+    default:
+      throw std::logic_error("compressions should not be something else than zimcompLzma, zimComZip or zimcompZstd.");
+  }
+  // The array is released from the unique_ptr and handed to the buffer.
+  return std::shared_ptr<const Buffer>(new MemoryBuffer<true>(uncompressed_data.release(), uncompressed_size));
+}
+
+// Builds a reader over the cluster starting at `offset`.
+//
+// The first byte of the cluster is an info byte: its low four bits give the
+// compression type (stored in *comp) and bit 0x10 the "extended" flag
+// (stored in *extended).  The cluster data starts right after that byte.
+//  - uncompressed clusters are exposed through a sub-reader of this reader;
+//  - LZMA / zlib / zstd clusters are uncompressed into a buffer and exposed
+//    through a BufferReader;
+//  - bzip2 throws std::runtime_error, anything else ZimFileFormatError.
+std::unique_ptr<const Reader> Reader::sub_clusterReader(offset_t offset, CompressionType* comp, bool* extended) const {
+  const uint8_t infoByte = read(offset);
+  *comp = static_cast<CompressionType>(infoByte & 0x0F);
+  *extended = infoByte & 0x10;
+
+  // Position of the cluster data, just past the info byte.
+  const offset_t dataOffset = offset + offset_t(1);
+
+  switch (*comp) {
+    case zimcompDefault:
+    case zimcompNone:
+      // No compression, just a sub_reader
+      return sub_reader(dataOffset, Cluster::read_size(this, *extended, dataOffset));
+
+    case zimcompLzma:
+    case zimcompZip:
+    case zimcompZstd:
+      return std::unique_ptr<Reader>(new BufferReader(get_clusterBuffer(dataOffset, *comp)));
+
+    case zimcompBzip2:
+      throw std::runtime_error("bzip2 not enabled in this library");
+
+    default:
+      throw ZimFileFormatError("Invalid compression flag");
+  }
+}
+
+// Returns a new FileReader on the same source, restricted to the `size`
+// bytes starting at `offset` (relative to this reader).
+std::unique_ptr<const Reader> FileReader::sub_reader(offset_t offset, zsize_t size) const
+{
+  ASSERT(size, <=, _size);
+  return std::unique_ptr<Reader>(new FileReader(source, _offset+offset, size));
+}
+
+
+//BufferReader::BufferReader(std::shared_ptr<Buffer> source)
+//  : source(source) {}
+
+// Returns source->sub_buffer(offset, size).
+std::shared_ptr<const Buffer> BufferReader::get_buffer(offset_t offset, zsize_t size) const
+{
+  return source->sub_buffer(offset, size);
+}
+
+// Returns a BufferReader over the sub-buffer [offset, offset+size) of this
+// reader's buffer.
+std::unique_ptr<const Reader> BufferReader::sub_reader(offset_t offset, zsize_t size) const
+{
+  return std::unique_ptr<const Reader>(new BufferReader(get_buffer(offset, size)));
+}
+
+// Size of the underlying buffer.
+zsize_t BufferReader::size() const
+{
+  return source->size();
+}
+
+// Returns the memory address of the first byte of the underlying buffer,
+// converted to an offset_t - not a position within a file.
+offset_t BufferReader::offset() const
+{
+  return offset_t((offset_type)(static_cast<const void*>(source->data(offset_t(0)))));
+}
+
+
+// Copies `size` bytes starting at `offset` of the underlying buffer into
+// `dest`.  A zero `size` copies nothing.
+//
+// NOTE(review): the first assertion is evaluated before the zero-size early
+// return, so a zero-byte read at offset == buffer size does not satisfy it -
+// confirm whether that is intended.
+void BufferReader::read(char* dest, offset_t offset, zsize_t size) const {
+  ASSERT(offset.v, <, source->size().v);
+  ASSERT(offset+offset_t(size.v), <=, offset_t(source->size().v));
+  if (! size ) {
+    return;
+  }
+  memcpy(dest, source->data(offset), size.v);
+}
+
+
+// Returns the byte stored at `offset` of the underlying buffer.
+char BufferReader::read(offset_t offset) const {
+  ASSERT(offset.v, <, source->size().v);
+  return *source->data(offset);
+}
+
+
+} // zim
diff --git a/src/file_reader.h b/src/file_reader.h

new file mode 100644 (file)

index 0000000..8fb1d44
--- /dev/null
+++ b/src/file_reader.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILE_READER_H_
+#define ZIM_FILE_READER_H_
+
+#include <memory>
+
+#include "zim_types.h"
+#include "endian_tools.h"
+#include "debug.h"
+
+namespace zim {
+
+class Buffer;
+class FileCompound;
+
+// Abstract read-only view over a span of bytes.
+//
+// Concrete readers provide the size, the raw read primitives and the way to
+// carve out sub-readers and buffers; this base class adds convenience
+// overloads and cluster decoding on top of them.
+class Reader {
+  public:
+    Reader() {};
+    // Number of bytes this reader gives access to.
+    virtual zsize_t size() const = 0;
+    virtual ~Reader() {};
+
+    // Copies `size` bytes starting at `offset` into `dest`.
+    virtual void read(char* dest, offset_t offset, zsize_t size) const = 0;
+    // Reads sizeof(T) bytes at `offset` and decodes them as a little-endian T.
+    template<typename T>
+    T read(offset_t offset) const {
+      ASSERT(offset.v, <, size().v);
+      ASSERT(offset.v+sizeof(T), <=, size().v);
+      char tmp_buf[sizeof(T)];
+      read(tmp_buf, offset, zsize_t(sizeof(T)));
+      return fromLittleEndian<T>(tmp_buf);
+    }
+    // Reads the single byte at `offset`.
+    virtual char read(offset_t offset) const = 0;
+
+    // Buffer over the `size` bytes at `offset`.
+    virtual std::shared_ptr<const Buffer> get_buffer(offset_t offset, zsize_t size) const = 0;
+    // Buffer from `offset` to the end of this reader.
+    std::shared_ptr<const Buffer> get_buffer(offset_t offset) const {
+      return get_buffer(offset, zsize_t(size().v-offset.v));
+    }
+    // Reader restricted to the `size` bytes at `offset`.
+    virtual std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const = 0;
+    // Reader from `offset` to the end of this reader.
+    std::unique_ptr<const Reader> sub_reader(offset_t offset) const {
+      return sub_reader(offset, zsize_t(size().v-offset.v));
+    }
+    virtual offset_t offset() const = 0;
+
+    // Reader over the cluster starting at `offset`; the compression type and
+    // the extended flag read from the cluster are stored through the two
+    // pointers.
+    std::unique_ptr<const Reader> sub_clusterReader(offset_t offset,
+                                                    CompressionType* comp,
+                                                    bool* extented) const;
+
+    // True when the `size` bytes at `offset` lie within this reader.
+    bool can_read(offset_t offset, zsize_t size);
+
+  private:
+    // Uncompresses the cluster data at `offset` according to `comp`.
+    std::shared_ptr<const Buffer> get_clusterBuffer(offset_t offset, CompressionType comp) const;
+};
+
+// Reader over a window (_offset, _size) of a FileCompound.
+class FileReader : public Reader {
+  public:
+    // Reader over the whole of `source`.
+    FileReader(std::shared_ptr<const FileCompound> source);
+    ~FileReader() {}
+
+    zsize_t size() const { return _size; }
+    offset_t offset() const { return _offset; }
+
+    char read(offset_t offset) const;
+    void read(char* dest, offset_t offset, zsize_t size) const;
+    std::shared_ptr<const Buffer> get_buffer(offset_t offset, zsize_t size) const;
+
+    std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const;
+
+  private:
+    // Windowed constructors, only reachable from within the class.
+    FileReader(std::shared_ptr<const FileCompound> source, offset_t offset);
+    FileReader(std::shared_ptr<const FileCompound> source, offset_t offset, zsize_t size);
+
+    std::shared_ptr<const FileCompound> source;
+    offset_t _offset;  // start of the window inside the compound
+    zsize_t _size;     // length of the window
+};
+
+// Reader whose bytes come from an in-memory Buffer.
+class BufferReader : public Reader {
+  public:
+    BufferReader(std::shared_ptr<const Buffer> source)
+      : source(source)
+    {}
+    virtual ~BufferReader() {}
+
+    zsize_t size() const;
+    offset_t offset() const;
+
+    void read(char* dest, offset_t offset, zsize_t size) const;
+    char read(offset_t offset) const;
+    std::shared_ptr<const Buffer> get_buffer(offset_t offset, zsize_t size) const;
+    std::unique_ptr<const Reader> sub_reader(offset_t offset, zsize_t size) const;
+
+  private:
+    std::shared_ptr<const Buffer> source;
+};
+
+};
+
+#endif // ZIM_FILE_READER_H_
diff --git a/src/fileheader.cpp b/src/fileheader.cpp

new file mode 100644 (file)

index 0000000..0e48990
--- /dev/null
+++ b/src/fileheader.cpp
@@ -0,0 +1,131 @@
+/*
+ * Copyright (C) 2008 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/fileheader.h>
+#include <zim/error.h>
+#include <iostream>
+#include <algorithm>
+#include "log.h"
+#include "endian_tools.h"
+#include "buffer.h"
+#ifdef _WIN32
+# include "io.h"
+#else
+# include "unistd.h"
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+log_define("zim.file.header")
+
+namespace zim
+{
+  // Magic number stored in the first four header bytes; written little-endian
+  // it is the byte sequence 'Z' 'I' 'M' 0x04.
+  const uint32_t Fileheader::zimMagic = 0x044d495a; // ="ZIM^d"
+  // The two major versions accepted by Fileheader::read().
+  const uint16_t Fileheader::zimClassicMajorVersion = 5;
+  const uint16_t Fileheader::zimExtendedMajorVersion = 6;
+  const uint16_t Fileheader::zimMinorVersion = 0;
+  // Size in bytes of the serialized header.
+  const offset_type Fileheader::size = 80; // This is also mimeListPos (so an offset)
+
+  // Serializes the header into Fileheader::size bytes and writes them to
+  // `out_fd`.
+  //
+  // Every multi-byte field is stored little-endian at a fixed position; the
+  // numbers added to `header` below are those byte positions.  On non-Windows
+  // builds `_write` is the macro defined at the top of this file, which
+  // throws std::runtime_error when fewer bytes than requested are written.
+  void Fileheader::write(int out_fd) const
+  {
+    char header[Fileheader::size];
+    toLittleEndian(Fileheader::zimMagic, header);
+    toLittleEndian(getMajorVersion(), header + 4);
+    toLittleEndian(getMinorVersion(), header + 6);
+    // The UUID is copied byte for byte, without any endianness conversion.
+    std::copy(getUuid().data, getUuid().data + sizeof(Uuid), header + 8);
+    toLittleEndian(getArticleCount(), header + 24);
+    toLittleEndian(getClusterCount(), header + 28);
+    toLittleEndian(getUrlPtrPos(), header + 32);
+    toLittleEndian(getTitleIdxPos(), header + 40);
+    toLittleEndian(getClusterPtrPos(), header + 48);
+    toLittleEndian(getMimeListPos(), header + 56);
+    toLittleEndian(getMainPage(), header + 64);
+    toLittleEndian(getLayoutPage(), header + 68);
+    toLittleEndian(getChecksumPos(), header + 72);
+
+    _write(out_fd, header, Fileheader::size);
+  }
+
+  // Parses a serialized header from `buffer` into this object.
+  //
+  // Fields are read from the same fixed byte positions that write() uses.
+  // Throws ZimFileFormatError when the magic number is wrong, when the major
+  // version is neither zimClassicMajorVersion nor zimExtendedMajorVersion, or
+  // when sanity_check() rejects the resulting values.
+  void Fileheader::read(std::shared_ptr<const Buffer> buffer)
+  {
+    uint32_t magicNumber = buffer->as<uint32_t>(offset_t(0));
+    if (magicNumber != Fileheader::zimMagic)
+    {
+      log_error("invalid magic number " << magicNumber << " found - "
+          << Fileheader::zimMagic << " expected");
+      throw ZimFileFormatError("Invalid magic number");
+    }
+
+    uint16_t major_version = buffer->as<uint16_t>(offset_t(4));
+    if (major_version != zimClassicMajorVersion && major_version != zimExtendedMajorVersion)
+    {
+      // Report both accepted versions, using the constants that actually
+      // drive the check above.
+      log_error("invalid zimfile major version " << major_version << " found - "
+          << zimClassicMajorVersion << " or " << zimExtendedMajorVersion << " expected");
+      throw ZimFileFormatError("Invalid version");
+    }
+    setMajorVersion(major_version);
+
+    setMinorVersion(buffer->as<uint16_t>(offset_t(6)));
+
+    // The UUID occupies bytes 8 to 23 and is copied verbatim.
+    Uuid uuid;
+    std::copy(buffer->data(offset_t(8)), buffer->data(offset_t(24)), uuid.data);
+    setUuid(uuid);
+
+    setArticleCount(buffer->as<uint32_t>(offset_t(24)));
+    setClusterCount(buffer->as<uint32_t>(offset_t(28)));
+    setUrlPtrPos(buffer->as<uint64_t>(offset_t(32)));
+    setTitleIdxPos(buffer->as<uint64_t>(offset_t(40)));
+    setClusterPtrPos(buffer->as<uint64_t>(offset_t(48)));
+    setMimeListPos(buffer->as<uint64_t>(offset_t(56)));
+    setMainPage(buffer->as<uint32_t>(offset_t(64)));
+    setLayoutPage(buffer->as<uint32_t>(offset_t(68)));
+    setChecksumPos(buffer->as<uint64_t>(offset_t(72)));
+
+    sanity_check();
+  }
+
+  // Checks that the header fields are consistent with one another and throws
+  // ZimFileFormatError for the first rule that is violated.  The rules are
+  // evaluated in the order written below.
+  void Fileheader::sanity_check() const {
+    // Articles and clusters must be either both present or both absent.
+    const bool hasArticles = articleCount != 0;
+    const bool hasClusters = clusterCount != 0;
+    if (hasArticles != hasClusters) {
+      throw ZimFileFormatError("No article <=> No cluster");
+    }
+
+    // The MIME list must start at the header size, or at 72.
+    const bool mimeListPosAccepted = (mimeListPos == size) || (mimeListPos == 72);
+    if (!mimeListPosAccepted) {
+      throw ZimFileFormatError("mimelistPos must be 80.");
+    }
+
+    // None of the tables may start before the MIME list.
+    if (urlPtrPos < mimeListPos) {
+      throw ZimFileFormatError("urlPtrPos must be > mimelistPos.");
+    }
+    if (titleIdxPos < mimeListPos) {
+      throw ZimFileFormatError("titleIdxPos must be > mimelistPos.");
+    }
+    if (clusterPtrPos < mimeListPos) {
+      throw ZimFileFormatError("clusterPtrPos must be > mimelistPos.");
+    }
+
+    if (clusterCount > articleCount) {
+      throw ZimFileFormatError("Cluster count cannot be higher than article count.");
+    }
+
+    // A checksum position of 0 is accepted; any other value must not precede
+    // the MIME list.
+    if (checksumPos != 0 && checksumPos < mimeListPos) {
+      throw ZimFileFormatError("checksumPos must be > mimeListPos.");
+    }
+  }
+
+}
diff --git a/src/fileimpl.cpp b/src/fileimpl.cpp

new file mode 100644 (file)

index 0000000..bc4d0f3
--- /dev/null
+++ b/src/fileimpl.cpp
@@ -0,0 +1,602 @@
+/*
+ * Copyright (C) 2006,2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "fileimpl.h"
+#include <zim/error.h>
+#include "_dirent.h"
+#include "file_compound.h"
+#include "file_reader.h"
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sstream>
+#include <errno.h>
+#include <cstring>
+#include <fstream>
+#include "config.h"
+#include "log.h"
+#include "envvalue.h"
+#include "md5.h"
+
+log_define("zim.file.impl")
+
+namespace zim
+{
+  //////////////////////////////////////////////////////////////////////
+  // FileImpl
+  //
+  FileImpl::FileImpl(const std::string& fname)
+    : zimFile(new FileCompound(fname)),
+      zimReader(new FileReader(zimFile)),
+      bufferDirentZone(256),
+      bufferDirentLock(PTHREAD_MUTEX_INITIALIZER),
+      filename(fname),
+      direntCache(envValue("ZIM_DIRENTCACHE", DIRENT_CACHE_SIZE)),
+      direntCacheLock(PTHREAD_MUTEX_INITIALIZER),
+      clusterCache(envValue("ZIM_CLUSTERCACHE", CLUSTER_CACHE_SIZE)),
+      clusterCacheLock(PTHREAD_MUTEX_INITIALIZER),
+      cacheUncompressedCluster(envValue("ZIM_CACHEUNCOMPRESSEDCLUSTER", false)),
+      namespaceBeginLock(PTHREAD_MUTEX_INITIALIZER),
+      namespaceEndLock(PTHREAD_MUTEX_INITIALIZER)
+  {
+    log_trace("read file \"" << fname << '"');
+
+    if (zimFile->fail())
+      throw ZimFileFormatError(std::string("can't open zim-file \"") + fname + '"');
+
+    filename = fname;
+
+    // read header
+    if (size_type(zimReader->size()) < Fileheader::size) {
+      throw ZimFileFormatError("zim-file is too small to contain a header");
+    }
+    try {
+      header.read(zimReader->get_buffer(offset_t(0), zsize_t(Fileheader::size)));
+    } catch (ZimFileFormatError& e) {
+      throw e;
+    } catch (...) {
+      throw ZimFileFormatError("error reading zim-file header.");
+    }
+
+    // urlPtrOffsetReader
+    zsize_t size(header.getArticleCount() * 8);
+    if (!zimReader->can_read(offset_t(header.getUrlPtrPos()), size)) {
+      throw ZimFileFormatError("Reading out of zim file.");
+    }
+#ifdef ENABLE_USE_BUFFER_HEADER
+    urlPtrOffsetReader = std::unique_ptr<Reader>(new BufferReader(
+       zimReader->get_buffer(offset_t(header.getUrlPtrPos()), size)));
+#else
+    urlPtrOffsetReader = zimReader->sub_reader(offset_t(header.getUrlPtrPos()), size);
+#endif
+
+    // Create titleIndexBuffer
+    size = zsize_t(header.getArticleCount() * 4);
+    if (!zimReader->can_read(offset_t(header.getTitleIdxPos()), size)) {
+      throw ZimFileFormatError("Reading out of zim file.");
+    }
+#ifdef ENABLE_USE_BUFFER_HEADER
+    titleIndexReader = std::unique_ptr<Reader>(new BufferReader(
+        zimReader->get_buffer(offset_t(header.getTitleIdxPos()), size)));
+#else
+    titleIndexReader = zimReader->sub_reader(offset_t(header.getTitleIdxPos()), size);
+#endif
+
+    // clusterOffsetBuffer
+    size = zsize_t(header.getClusterCount() * 8);
+    if (!zimReader->can_read(offset_t(header.getClusterPtrPos()), size)) {
+      throw ZimFileFormatError("Reading out of zim file.");
+    }
+#ifdef ENABLE_USE_BUFFER_HEADER
+    clusterOffsetReader = std::unique_ptr<Reader>(new BufferReader(
+        zimReader->get_buffer(offset_t(header.getClusterPtrPos()), size)));
+#else
+    clusterOffsetReader = zimReader->sub_reader(offset_t(header.getClusterPtrPos()), size);
+#endif
+
+    if (!getCountClusters())
+      log_warn("no clusters found");
+    else
+    {
+      offset_t lastOffset = getClusterOffset(cluster_index_t(cluster_index_type(getCountClusters()) - 1));
+      log_debug("last offset=" << lastOffset.v << " file size=" << zimFile->fsize().v);
+      if (lastOffset.v > zimFile->fsize().v)
+      {
+        log_fatal("last offset (" << lastOffset << ") larger than file size (" << zimFile->fsize() << ')');
+        throw ZimFileFormatError("last cluster offset larger than file size; file corrupt");
+      }
+    }
+
+    if (header.hasChecksum() && header.getChecksumPos() != (zimFile->fsize().v-16) ) {
+      throw ZimFileFormatError("Checksum position is not valid");
+    }
+
+    // read mime types
+    size = zsize_t(header.getUrlPtrPos() - header.getMimeListPos());
+    // No need to check access, getUrlPtrPos is in the zim file, and we are
+    // sure that getMimeListPos is 80.
+    auto buffer = zimReader->get_buffer(offset_t(header.getMimeListPos()), size);
+    offset_t current = offset_t(0);
+    while (current.v < size.v)
+    {
+      offset_type len = strlen(buffer->data(current));
+
+      if (len == 0) {
+        break;
+      }
+
+      if (current.v + len >= size.v) {
+       throw(ZimFileFormatError("Error getting mimelists."));
+      }
+
+      std::string mimeType(buffer->data(current), len);
+      mimeTypes.push_back(mimeType);
+
+      current += (len + 1);
+    }
+  }
+
+
+  std::pair<bool, article_index_t> FileImpl::findx(char ns, const std::string& url)
+  {
+    log_debug("find article by url " << ns << " \"" << url << "\",  in file \"" << getFilename() << '"');
+
+    article_index_type l = article_index_type(getNamespaceBeginOffset(ns));
+    article_index_type u = article_index_type(getNamespaceEndOffset(ns));
+
+    if (l == u)
+    {
+      log_debug("namespace " << ns << " not found");
+      return std::pair<bool, article_index_t>(false, article_index_t(0));
+    }
+
+    unsigned itcount = 0;
+    while (u - l > 1)
+    {
+      ++itcount;
+      article_index_type p = l + (u - l) / 2;
+      auto d = getDirent(article_index_t(p));
+
+      int c = ns < d->getNamespace() ? -1
+            : ns > d->getNamespace() ? 1
+            : url.compare(d->getUrl());
+
+      if (c < 0)
+        u = p;
+      else if (c > 0)
+        l = p;
+      else
+      {
+        log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p);
+        return std::pair<bool, article_index_t>(true, article_index_t(p));
+      }
+    }
+
+    auto d = getDirent(article_index_t(l));
+    int c = url.compare(d->getUrl());
+
+    if (c == 0)
+    {
+      log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l);
+      return std::pair<bool, article_index_t>(true, article_index_t(l));
+    }
+
+    log_debug("article not found after " << itcount << " iterations (\"" << d.getUrl() << "\" does not match)");
+    return std::pair<bool, article_index_t>(false, article_index_t(c < 0 ? l : u));
+  }
+
+  // Look up a full url of the form "N/path" or "/N/path", where N is the
+  // one-character namespace.  Anything else yields {false, 0}.
+  std::pair<bool, article_index_t> FileImpl::findx(const std::string& url)
+  {
+    // Skip one optional leading slash.
+    const size_t nsPos = (url[0] == '/') ? 1 : 0;
+    const size_t slashPos = nsPos + 1;
+    const size_t pathPos = nsPos + 2;
+
+    const bool wellFormed = !(url.size() < pathPos) && url[slashPos] == '/';
+    if (!wellFormed)
+      return std::pair<bool, article_index_t>(false, article_index_t(0));
+
+    const char ns = url[nsPos];
+    return findx(ns, url.substr(pathPos));
+  }
+
+  std::pair<bool, article_index_t> FileImpl::findxByTitle(char ns, const std::string& title)
+  {
+    log_debug("find article by title " << ns << " \"" << title << "\", in file \"" << getFilename() << '"');
+
+    article_index_type l = article_index_type(getNamespaceBeginOffset(ns));
+    article_index_type u = article_index_type(getNamespaceEndOffset(ns));
+
+    if (l == u)
+    {
+      log_debug("namespace " << ns << " not found");
+      return std::pair<bool, article_index_t>(false, article_index_t(0));
+    }
+
+    unsigned itcount = 0;
+    while (u - l > 1)
+    {
+      ++itcount;
+      article_index_type p = l + (u - l) / 2;
+      auto d = getDirentByTitle(article_index_t(p));
+
+      int c = ns < d->getNamespace() ? -1
+            : ns > d->getNamespace() ? 1
+            : title.compare(d->getTitle());
+
+      if (c < 0)
+        u = p;
+      else if (c > 0)
+        l = p;
+      else
+      {
+        log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << p);
+        return std::pair<bool, article_index_t>(true, article_index_t(p));
+      }
+    }
+
+    auto d = getDirentByTitle(article_index_t(l));
+    int c = title.compare(d->getTitle());
+
+    if (c == 0)
+    {
+      log_debug("article found after " << itcount << " iterations in file \"" << getFilename() << "\" at index " << l);
+      return std::pair<bool, article_index_t>(true, article_index_t(l));
+    }
+
+    log_debug("article not found after " << itcount << " iterations (\"" << d.getTitle() << "\" does not match)");
+    return std::pair<bool, article_index_t>(false, article_index_t(c < 0 ? l : u));
+  }
+
+  std::pair<bool, article_index_t> FileImpl::findxByClusterOrder(article_index_type idx)
+  {
+      std::call_once(orderOnceFlag, [this]
+      {
+          auto nb_articles = this->getCountArticles().v;
+          articleListByCluster.reserve(nb_articles);
+
+          for(zim::article_index_type i = 0; i < nb_articles; i++)
+          {
+              articleListByCluster.push_back(std::make_pair(this->getDirent(article_index_t(i))->getClusterNumber().v, i));
+          }
+          std::sort(articleListByCluster.begin(), articleListByCluster.end());
+      });
+
+      if (idx >= articleListByCluster.size())
+          return std::pair<bool, article_index_t>(false, article_index_t(0));
+      return std::pair<bool, article_index_t>(true, article_index_t(articleListByCluster[idx].second));
+  }
+
+  // Forward to FileCompound::locate() for the range [offset, offset + size).
+  std::pair<FileCompound::const_iterator, FileCompound::const_iterator>
+  FileImpl::getFileParts(offset_t offset, zsize_t size)
+  {
+    auto parts = zimFile->locate(offset, size);
+    return parts;
+  }
+
+  std::shared_ptr<const Dirent> FileImpl::getDirent(article_index_t idx)
+  {
+    log_trace("FileImpl::getDirent(" << idx << ')');
+
+    if (idx >= getCountArticles())
+      throw ZimFileFormatError("article index out of range");
+
+    pthread_mutex_lock(&direntCacheLock);
+    auto v = direntCache.getx(idx);
+    if (v.first)
+    {
+      log_debug("dirent " << idx << " found in cache; hits "
+                << direntCache.getHits() << " misses "
+                << direntCache.getMisses() << " ratio "
+                << direntCache.hitRatio() * 100 << "% fillfactor "
+                << direntCache.fillfactor());
+      pthread_mutex_unlock(&direntCacheLock);
+      return v.second;
+    }
+
+    log_debug("dirent " << idx << " not found in cache; hits "
+              << direntCache.getHits() << " misses " << direntCache.getMisses()
+              << " ratio " << direntCache.hitRatio() * 100 << "% fillfactor "
+              << direntCache.fillfactor());
+    pthread_mutex_unlock(&direntCacheLock);
+
+    offset_t indexOffset = getOffset(urlPtrOffsetReader.get(), idx.v);
+    // We don't know the size of the dirent because it depends of the size of
+    // the title, url and extra parameters.
+    // This is a pitty but we have no choices.
+    // We cannot take a buffer of the size of the file, it would be really inefficient.
+    // Let's do try, catch and retry while chosing a smart value for the buffer size.
+    // Most dirent will be "Article" entry (header's size == 16) without extra parameters.
+    // Let's hope that url + title size will be < 256 and if not try again with a bigger size.
+
+    pthread_mutex_lock(&bufferDirentLock);
+    zsize_t bufferSize = zsize_t(256);
+    // On very small file, the offset + 256 is higher than the size of the file,
+    // even if the file is valid.
+    // So read only to the end of the file.
+    auto totalSize = zimReader->size();
+    if (indexOffset.v + 256 > totalSize.v) bufferSize = zsize_t(totalSize.v-indexOffset.v);
+    std::shared_ptr<const Dirent> dirent;
+    while (true) {
+        bufferDirentZone.reserve(size_type(bufferSize));
+        zimReader->read(bufferDirentZone.data(), indexOffset, bufferSize);
+        auto direntBuffer = std::unique_ptr<Buffer>(new MemoryBuffer<false>(bufferDirentZone.data(), bufferSize));
+        try {
+          dirent = std::make_shared<const Dirent>(std::move(direntBuffer));
+        } catch (InvalidSize&) {
+          // buffer size is not enougth, try again :
+          bufferSize += 256;
+          continue;
+        }
+        // Success !
+        break;
+    }
+    pthread_mutex_unlock(&bufferDirentLock);
+
+    log_debug("dirent read from " << indexOffset);
+    pthread_mutex_lock(&direntCacheLock);
+    direntCache.put(idx, dirent);
+    pthread_mutex_unlock(&direntCacheLock);
+
+    return dirent;
+  }
+
+  // Return the dirent found at position `idx` of the title index.
+  // Throws ZimFileFormatError when `idx` is not below the article count.
+  std::shared_ptr<const Dirent> FileImpl::getDirentByTitle(article_index_t idx)
+  {
+    if (idx >= getCountArticles())
+    {
+      throw ZimFileFormatError("article index out of range");
+    }
+
+    // Translate the title-order position into a url-order index first.
+    const article_index_t urlOrderIdx = getIndexByTitle(idx);
+    return getDirent(urlOrderIdx);
+  }
+
+  // Read entry number `idx` of the title index and return the article
+  // index stored there.
+  // Throws ZimFileFormatError when `idx` is not below the article count.
+  article_index_t FileImpl::getIndexByTitle(article_index_t idx)
+  {
+    if (idx >= getCountArticles())
+    {
+      throw ZimFileFormatError("article index out of range");
+    }
+
+    // Entries are laid out back to back, one per title position.
+    const offset_t entryPos(sizeof(article_index_t)*idx.v);
+    return article_index_t(titleIndexReader->read<article_index_type>(entryPos));
+  }
+
+  std::shared_ptr<const Cluster> FileImpl::getCluster(cluster_index_t idx)
+  {
+    if (idx >= getCountClusters())
+      throw ZimFileFormatError("cluster index out of range");
+
+    pthread_mutex_lock(&clusterCacheLock);
+    auto cluster(clusterCache.get(idx));
+    pthread_mutex_unlock(&clusterCacheLock);
+    if (cluster)
+    {
+      log_debug("cluster " << idx << " found in cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor());
+      return cluster;
+    }
+
+    offset_t clusterOffset(getClusterOffset(idx));
+    log_debug("read cluster " << idx << " from offset " << clusterOffset);
+    CompressionType comp;
+    bool extended;
+    std::shared_ptr<const Reader> reader = zimReader->sub_clusterReader(clusterOffset, &comp, &extended);
+    cluster = std::shared_ptr<Cluster>(new Cluster(reader, comp, extended));
+
+    log_debug("put cluster " << idx << " into cluster cache; hits " << clusterCache.getHits() << " misses " << clusterCache.getMisses() << " ratio " << clusterCache.hitRatio() * 100 << "% fillfactor " << clusterCache.fillfactor());
+    pthread_mutex_lock(&clusterCacheLock);
+    clusterCache.put(idx, cluster);
+    pthread_mutex_unlock(&clusterCacheLock);
+
+    return cluster;
+  }
+
+  // Read entry number `idx` from a table of offset_type values.
+  offset_t FileImpl::getOffset(const Reader* reader, size_t idx)
+  {
+    const offset_t entryPos(sizeof(offset_type)*idx);
+    return offset_t(reader->read<offset_type>(entryPos));
+  }
+
+  // Look up the offset of cluster `idx` in the cluster pointer list.
+  offset_t FileImpl::getClusterOffset(cluster_index_t idx)
+  {
+    const Reader* clusterTable = clusterOffsetReader.get();
+    return getOffset(clusterTable, idx.v);
+  }
+
+  // Return the position of blob `blobIdx` of cluster `clusterIdx`, computed
+  // as cluster offset + 1 + the blob offset reported by the cluster.
+  // Compressed clusters always yield offset 0.
+  offset_t FileImpl::getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx)
+  {
+    auto cluster = getCluster(clusterIdx);
+    if (!cluster->isCompressed())
+    {
+      return getClusterOffset(clusterIdx) + offset_t(1) + cluster->getBlobOffset(blobIdx);
+    }
+    return offset_t(0);
+  }
+
+  article_index_t FileImpl::getNamespaceBeginOffset(char ch)
+  {
+    log_trace("getNamespaceBeginOffset(" << ch << ')');
+
+    pthread_mutex_lock(&namespaceBeginLock);
+    NamespaceCache::const_iterator it = namespaceBeginCache.find(ch);
+    if (it != namespaceBeginCache.end())
+    {
+      article_index_t ret(it->second);
+      pthread_mutex_unlock(&namespaceBeginLock);
+      return ret;
+    }
+    pthread_mutex_unlock(&namespaceBeginLock);
+
+    article_index_type lower = 0;
+    article_index_type upper = article_index_type(getCountArticles());
+    auto d = getDirent(article_index_t(0));
+    while (upper - lower > 1)
+    {
+      article_index_type m = lower + (upper - lower) / 2;
+      auto d = getDirent(article_index_t(m));
+      if (d->getNamespace() >= ch)
+        upper = m;
+      else
+        lower = m;
+    }
+
+    article_index_t ret = article_index_t(d->getNamespace() < ch ? upper : lower);
+    pthread_mutex_lock(&namespaceBeginLock);
+    namespaceBeginCache[ch] = ret;
+    pthread_mutex_unlock(&namespaceBeginLock);
+
+    return ret;
+  }
+
+  article_index_t FileImpl::getNamespaceEndOffset(char ch)
+  {
+    log_trace("getNamespaceEndOffset(" << ch << ')');
+
+    pthread_mutex_lock(&namespaceEndLock);
+    NamespaceCache::const_iterator it = namespaceEndCache.find(ch);
+    if (it != namespaceEndCache.end())
+    {
+      article_index_t ret = it->second;
+      pthread_mutex_unlock(&namespaceEndLock);
+      return ret;
+    }
+    pthread_mutex_unlock(&namespaceEndLock);
+
+    article_index_type lower = 0;
+    article_index_type upper = article_index_type(getCountArticles());
+    log_debug("namespace " << ch << " lower=" << lower << " upper=" << upper);
+    while (upper - lower > 1)
+    {
+      article_index_type m = lower + (upper - lower) / 2;
+      auto d = getDirent(article_index_t(m));
+      if (d->getNamespace() > ch)
+        upper = m;
+      else
+        lower = m;
+      log_debug("namespace " << d->getNamespace() << " m=" << m << " lower=" << lower << " upper=" << upper);
+    }
+
+    pthread_mutex_lock(&namespaceEndLock);
+    namespaceEndCache[ch] = article_index_t(upper);
+    pthread_mutex_unlock(&namespaceEndLock);
+
+    return article_index_t(upper);
+  }
+
+  // Collect the namespace character of every namespace present in the
+  // file, in dirent order, by hopping from the end of one namespace to the
+  // first dirent of the next.
+  std::string FileImpl::getNamespaces()
+  {
+    auto current = getDirent(article_index_t(0));
+
+    std::string found;
+    found += current->getNamespace();
+
+    for (;;)
+    {
+      const article_index_t next = getNamespaceEndOffset(current->getNamespace());
+      if (!(next < getCountArticles()))
+        break;
+
+      current = getDirent(next);
+      found += current->getNamespace();
+    }
+
+    return found;
+  }
+
+  // Return the mime type registered under code `idx` in the list parsed
+  // by the constructor.
+  // Throws std::runtime_error when `idx` is not a valid index into it.
+  const std::string& FileImpl::getMimeType(uint16_t idx) const
+  {
+    // `>=`, not `>`: idx == size() is already one past the last entry.
+    if (idx >= mimeTypes.size())
+    {
+      std::ostringstream msg;
+      msg << "unknown mime type code " << idx;
+      throw std::runtime_error(msg.str());
+    }
+
+    return mimeTypes[idx];
+  }
+
+  std::string FileImpl::getChecksum()
+  {
+    if (!header.hasChecksum())
+      return std::string();
+
+    std::shared_ptr<const Buffer> chksum;
+    try {
+      chksum = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16));
+    } catch (...)
+    {
+      log_warn("error reading checksum");
+      return std::string();
+    }
+
+    char hexdigest[33];
+    hexdigest[32] = '\0';
+    static const char hex[] = "0123456789abcdef";
+    char* p = hexdigest;
+    for (int i = 0; i < 16; ++i)
+    {
+      uint8_t v = chksum->at(offset_t(i));
+      *p++ = hex[v >> 4];
+      *p++ = hex[v & 0xf];
+    }
+    log_debug("chksum=" << hexdigest);
+    return hexdigest;
+  }
+
+  bool FileImpl::verify()
+  {
+    if (!header.hasChecksum())
+      return false;
+
+    struct zim_MD5_CTX md5ctx;
+    zim_MD5Init(&md5ctx);
+
+    offset_type checksumPos = header.getChecksumPos();
+    offset_type currentPos = 0;
+    for(auto part = zimFile->begin();
+        part != zimFile->end();
+        part++) {
+      std::ifstream stream(part->second->filename());
+      char ch;
+      for(/*NOTHING*/ ; currentPos < checksumPos && stream.get(ch).good(); currentPos++) {
+        zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&ch), 1);
+      }
+      if (stream.bad()) {
+        perror("error while reading file");
+        return false;
+      }
+      if (currentPos == checksumPos) {
+        break;
+      }
+    }
+
+    if (currentPos != checksumPos) {
+      return false;
+    }
+
+    unsigned char chksumCalc[16];
+    auto chksumFile = zimReader->get_buffer(offset_t(header.getChecksumPos()), zsize_t(16));
+
+    zim_MD5Final(chksumCalc, &md5ctx);
+    if (std::memcmp(chksumFile->data(), chksumCalc, 16) != 0)
+    {
+      return false;
+    }
+
+    return true;
+  }
+
+  // Modification time, as reported by the underlying FileCompound.
+  time_t FileImpl::getMTime() const
+  {
+    const time_t mtime = zimFile->getMTime();
+    return mtime;
+  }
+
+  // Size, as reported by the underlying FileCompound.
+  zim::zsize_t FileImpl::getFilesize() const
+  {
+    const zim::zsize_t total = zimFile->fsize();
+    return total;
+  }
+
+  // Whether the underlying FileCompound reports itself as multi-part.
+  bool FileImpl::is_multiPart() const
+  {
+    const bool multiPart = zimFile->is_multiPart();
+    return multiPart;
+  }
+}
diff --git a/src/fileimpl.h b/src/fileimpl.h

new file mode 100644 (file)

index 0000000..ecddadb
--- /dev/null
+++ b/src/fileimpl.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FILEIMPL_H
+#define ZIM_FILEIMPL_H
+
+#include <string>
+#include <vector>
+#include <map>
+#include <memory>
+#include <pthread.h>
+#include <zim/zim.h>
+#include <zim/fileheader.h>
+#include <mutex>
+#include "cache.h"
+#include "_dirent.h"
+#include "cluster.h"
+#include "buffer.h"
+#include "file_reader.h"
+#include "file_compound.h"
+#include "zim_types.h"
+
+namespace zim
+{
+  // Implementation object behind a zim file: owns the opened file, the
+  // parsed header, the readers over the three lookup tables and the caches
+  // used for dirents, clusters and namespace bounds.
+  //
+  // Data members are initialized in declaration order; zimReader is built
+  // from zimFile in the constructor, so keep zimFile declared first.
+  class FileImpl
+  {
+      std::shared_ptr<FileCompound> zimFile;
+      std::shared_ptr<FileReader> zimReader;
+      // Scratch buffer that getDirent() reads raw dirent bytes into;
+      // guarded by bufferDirentLock.
+      std::vector<char> bufferDirentZone;
+      pthread_mutex_t bufferDirentLock;
+      Fileheader header;
+      std::string filename;
+
+      // Readers over the title index, the url pointer list and the cluster
+      // pointer list; all three are set up by the constructor.
+      std::unique_ptr<const Reader> titleIndexReader;
+      std::unique_ptr<const Reader> urlPtrOffsetReader;
+      std::unique_ptr<const Reader> clusterOffsetReader;
+
+      // Read entry `idx` from a table of offset_type values.
+      offset_t getOffset(const Reader* reader, size_t idx);
+
+      // Dirents already parsed, keyed by url-order index.
+      Cache<article_index_t, std::shared_ptr<const Dirent>> direntCache;
+      pthread_mutex_t direntCacheLock;
+
+      // Clusters already loaded, keyed by cluster index.
+      Cache<cluster_index_t, std::shared_ptr<Cluster>> clusterCache;
+      pthread_mutex_t clusterCacheLock;
+
+      // Initialized from the ZIM_CACHEUNCOMPRESSEDCLUSTER environment value;
+      // NOTE(review): not read anywhere in fileimpl.cpp - confirm it is
+      // still needed.
+      bool cacheUncompressedCluster;
+      typedef std::map<char, article_index_t> NamespaceCache;
+
+      // Memoized results of getNamespaceBeginOffset()/getNamespaceEndOffset(),
+      // each map guarded by the mutex declared right after it.
+      NamespaceCache namespaceBeginCache;
+      pthread_mutex_t namespaceBeginLock;
+      NamespaceCache namespaceEndCache;
+      pthread_mutex_t namespaceEndLock;
+
+      // Mime types parsed from the file by the constructor, indexed by
+      // mime-type code.
+      typedef std::vector<std::string> MimeTypes;
+      MimeTypes mimeTypes;
+
+      // (cluster number, article index) pairs, sorted; filled exactly once
+      // by findxByClusterOrder() under orderOnceFlag.
+      using pair_type = std::pair<cluster_index_type, article_index_type>;
+      std::vector<pair_type> articleListByCluster;
+      std::once_flag orderOnceFlag;
+
+    public:
+      // Open `fname`; throws ZimFileFormatError when the file cannot be
+      // opened or fails validation.
+      explicit FileImpl(const std::string& fname);
+
+      time_t getMTime() const;
+
+      const std::string& getFilename() const   { return filename; }
+      const Fileheader& getFileheader() const  { return header; }
+      zsize_t getFilesize() const;
+
+      std::pair<FileCompound::const_iterator, FileCompound::const_iterator>
+      getFileParts(offset_t offset, zsize_t size);
+      // Dirent access by url-order index and by title-order index.
+      std::shared_ptr<const Dirent> getDirent(article_index_t idx);
+      std::shared_ptr<const Dirent> getDirentByTitle(article_index_t idx);
+      // Map a title-order index to the corresponding url-order index.
+      article_index_t getIndexByTitle(article_index_t idx);
+      article_index_t getCountArticles() const { return article_index_t(header.getArticleCount()); }
+
+
+      // Lookups; the bool of the returned pair tells whether an exact
+      // match was found.
+      std::pair<bool, article_index_t> findx(char ns, const std::string& url);
+      std::pair<bool, article_index_t> findx(const std::string& url);
+      std::pair<bool, article_index_t> findxByTitle(char ns, const std::string& title);
+      std::pair<bool, article_index_t> findxByClusterOrder(article_index_type idx);
+
+      std::shared_ptr<const Cluster> getCluster(cluster_index_t idx);
+      cluster_index_t getCountClusters() const       { return cluster_index_t(header.getClusterCount()); }
+      offset_t getClusterOffset(cluster_index_t idx);
+      // Returns offset 0 for blobs of compressed clusters.
+      offset_t getBlobOffset(cluster_index_t clusterIdx, blob_index_t blobIdx);
+
+      // Bounds of the run of dirents belonging to a namespace.
+      article_index_t getNamespaceBeginOffset(char ch);
+      article_index_t getNamespaceEndOffset(char ch);
+      article_index_t getNamespaceCount(char ns)
+        { return getNamespaceEndOffset(ns) - getNamespaceBeginOffset(ns); }
+
+      std::string getNamespaces();
+      // NOTE(review): declared here but not defined in fileimpl.cpp -
+      // confirm a definition exists elsewhere before calling it.
+      bool hasNamespace(char ch) const;
+
+      // Throws std::runtime_error for an unknown mime-type code.
+      const std::string& getMimeType(uint16_t idx) const;
+
+      // Stored checksum as a hex string (empty when there is none).
+      std::string getChecksum();
+      // Recompute the MD5 and compare it with the stored checksum.
+      bool verify();
+      bool is_multiPart() const;
+  };
+
+}
+
+#endif // ZIM_FILEIMPL_H
+
diff --git a/src/fs.h b/src/fs.h

new file mode 100644 (file)

index 0000000..5736a5e
--- /dev/null
+++ b/src/fs.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_H_
+#define ZIM_FS_H_
+
+#ifdef _WIN32
+# include "fs_windows.h"
+#else
+# include "fs_unix.h"
+#endif
+
+namespace zim {
+
+// DEFAULTFS names the filesystem backend matching the header included
+// above for the current platform.
+#ifdef _WIN32
+using DEFAULTFS = windows::FS;
+#else
+using DEFAULTFS = unix::FS;
+#endif
+
+} // zim namespace (no ';' after a namespace: it is only a stray empty declaration)
+
+#endif //ZIM_FS_H_
diff --git a/src/fs_unix.cpp b/src/fs_unix.cpp

new file mode 100644 (file)

index 0000000..145dbc0
--- /dev/null
+++ b/src/fs_unix.cpp
@@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "fs_unix.h"
+#include <stdexcept>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <errno.h>
+
+namespace zim
+{
+
+namespace unix {
+
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+#if defined(__APPLE__) || defined(__OpenBSD__)
+# define PREAD pread
+#else
+# define PREAD pread64
+#endif
+  ssize_t full_size_read = 0;
+  auto size_to_read = size.v;
+  auto current_offset = offset.v;
+  errno = 0;
+  while (size_to_read > 0) {
+    auto size_read = PREAD(m_fd, dest, size_to_read, current_offset);
+    if (size_read == -1) {
+      return zsize_t(-1);
+    }
+    size_to_read -= size_read;
+    current_offset += size_read;
+    full_size_read += size_read;
+  }
+  return zsize_t(full_size_read);
+#undef PREAD
+}
+
+// Return the size of the file behind the descriptor as reported by fstat().
+// Returns 0 when fstat() fails (e.g. the descriptor is not open) instead of
+// reading an uninitialized stat structure.
+zsize_t FD::getSize() const
+{
+  struct stat sb;
+  if (fstat(m_fd, &sb) != 0) {
+    return zsize_t(0);
+  }
+  return zsize_t(sb.st_size);
+}
+
+// Move the descriptor's file position to the absolute position `offset`.
+// Returns true when lseek() reports exactly that position.
+bool FD::seek(offset_t offset)
+{
+    const auto reached = lseek(m_fd, offset.v, SEEK_SET);
+    return reached == static_cast<int64_t>(offset.v);
+}
+
+// Close the descriptor if one is held.
+//
+// The return value keeps the existing convention of this function: it is
+// the result of ::close() converted to bool, so false means the descriptor
+// was closed successfully and true means ::close() failed or there was
+// nothing to close.
+bool FD::close() {
+  if (m_fd != -1) {
+    const bool failed = ::close(m_fd);
+    // Forget the descriptor whatever the outcome, so that a later close()
+    // (for instance from the destructor) cannot close the same number again
+    // after it may have been reused for another file.
+    m_fd = -1;
+    return failed;
+  }
+  return true;
+}
+
+// Open `filepath` read-only and return the descriptor wrapped in an FD.
+// Throws std::runtime_error (naming the path) when the file cannot be opened.
+FD FS::openFile(path_t filepath)
+{
+  int fd = open(filepath.c_str(), O_RDONLY);
+  if (fd == -1) {
+    // Say which file failed instead of throwing an empty message.
+    throw std::runtime_error(std::string("cannot open file \"") + filepath + "\"");
+  }
+  return FD(fd);
+}
+
+// Create directory `path` with mode rwxrwxr-x.
+// Returns true when mkdir() succeeds.
+bool FS::makeDirectory(path_t path)
+{
+  const mode_t mode = S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH;
+  return mkdir(path.c_str(), mode) == 0;
+}
+
+// Rename `old_path` to `new_path`.  The result of the underlying call is
+// not reported to the caller.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  const char* from = old_path.c_str();
+  const char* to = new_path.c_str();
+  ::rename(from, to);
+}
+
+// Build the path "<base>/<name>".
+std::string FS::join(path_t base, path_t name)
+{
+  std::string joined(base);
+  joined += "/";
+  joined += name;
+  return joined;
+}
+
+bool FS::remove(path_t path)
+{
+  DIR* dir;
+  /* It's a directory, remove all its entries first */
+  if ((dir = opendir(path.c_str())) != NULL) {
+    struct dirent* ent;
+    while ((ent = readdir(dir)) != NULL) {
+      std::string childName = ent->d_name;
+      if (childName !=  "." && childName != "..") {
+        auto childPath = join(path, childName);
+        remove(childPath);
+      }
+    }
+    closedir(dir);
+    return removeDir(path);
+  }
+
+  /* It's a file */
+  else {
+    return removeFile(path);
+  }
+}
+
+// Remove the (empty) directory `path`.
+// Returns the rmdir() result as a bool: true means rmdir() returned non-zero.
+bool FS::removeDir(path_t path)
+{
+  return rmdir(path.c_str()) != 0;
+}
+
+// Remove the file `path`.
+// Returns the ::remove() result as a bool: true means it returned non-zero.
+bool FS::removeFile(path_t path)
+{
+  return ::remove(path.c_str()) != 0;
+}
+
+
+}; // unix namespace
+
+}; // zim namespace
+
diff --git a/src/fs_unix.h b/src/fs_unix.h

new file mode 100644 (file)

index 0000000..1e79e9e
--- /dev/null
+++ b/src/fs_unix.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_UNIX_H_
+#define ZIM_FS_UNIX_H_
+
+#include "zim_types.h"
+
+#include <stdexcept>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <dirent.h>
+
+namespace zim {
+
+namespace unix {
+
+using path_t = const std::string&;
+
+class FD {
+  public:
+    using fd_t = int;
+
+  private:
+    fd_t m_fd = -1;
+
+  public:
+    FD() = default;
+    FD(fd_t fd):
+      m_fd(fd) {};
+    FD(const FD& o) = delete;
+    FD(FD&& o) :
+      m_fd(o.m_fd) { o.m_fd = -1; }
+    FD& operator=(FD&& o) {
+      m_fd = o.m_fd;
+      o.m_fd = -1;
+      return *this;
+    }
+    ~FD() { close(); }
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    zsize_t getSize() const;
+    fd_t    getNativeHandle() const
+    {
+        return m_fd;
+    }
+    fd_t    release()
+    {
+        int ret = m_fd;
+        m_fd = -1;
+        return ret;
+    }
+    bool    seek(offset_t offset);
+    bool    close();
+};
+
+// Stateless collection of filesystem helpers for unix-like systems.
+struct FS {
+    using FD = zim::unix::FD;
+
+    // Path helper.
+    static std::string join(path_t base, path_t name);
+
+    // Open a file read-only; throws std::runtime_error on failure.
+    static FD openFile(path_t filepath);
+
+    // Creation and renaming.
+    static bool makeDirectory(path_t path);
+    static void rename(path_t old_path, path_t new_path);
+
+    // Removal: remove() handles both files and whole directory trees.
+    static bool remove(path_t path);
+    static bool removeDir(path_t path);
+    static bool removeFile(path_t path);
+};
+
+}; // unix namespace
+
+}; // zim namespace
+
+#endif //ZIM_FS_UNIX_H_
diff --git a/src/fs_windows.cpp b/src/fs_windows.cpp

new file mode 100644 (file)

index 0000000..e4df1e4
--- /dev/null
+++ b/src/fs_windows.cpp
@@ -0,0 +1,199 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "fs_windows.h"
+#include <stdexcept>
+
+#include <windows.h>
+#include <winbase.h>
+#include <synchapi.h>
+#include <io.h>
+#include <fileapi.h>
+
+#include <iostream>
+#include <sstream>
+
+namespace zim {
+
+namespace windows {
+
+// Private state behind zim::windows::FD: the Win32 file handle plus the
+// critical section that FD::readAt() holds around its seek + read pair.
+struct ImplFD {
+  HANDLE m_handle;
+  CRITICAL_SECTION m_criticalSection;
+
+  // Starts out without a file (invalid handle).
+  ImplFD() : ImplFD(INVALID_HANDLE_VALUE) {}
+
+  // Wraps an already opened handle.
+  ImplFD(HANDLE handle) : m_handle(handle)
+  {
+    InitializeCriticalSection(&m_criticalSection);
+  }
+
+  // Only the critical section is torn down here; the handle itself is
+  // closed through FD::close().
+  ~ImplFD()
+  {
+    DeleteCriticalSection(&m_criticalSection);
+  }
+};
+
+// Builds an FD that holds no file yet (invalid handle).
+FD::FD()
+  : mp_impl(new ImplFD())
+{}
+
+// Wraps an already opened Win32 handle.
+FD::FD(fd_t handle)
+  : mp_impl(new ImplFD(handle))
+{}
+
+// Wraps a CRT file descriptor by way of the OS handle that backs it.
+FD::FD(int fd)
+  : mp_impl(new ImplFD(reinterpret_cast<HANDLE>(_get_osfhandle(fd))))
+{}
+
+// Move construction just takes over the implementation pointer; the source
+// is left without one, which every member function checks for.
+FD::FD(FD&& o) = default;
+
+// Move assignment. ~ImplFD() does not close the handle, so letting a
+// defaulted unique_ptr assignment destroy the previous ImplFD would leak
+// the file this object was holding. Close it explicitly before adopting
+// the implementation of `o`; a self-move is a no-op.
+FD& FD::operator=(FD&& o)
+{
+  if (this != &o) {
+    close();
+    mp_impl.reset(o.mp_impl.release());
+  }
+  return *this;
+}
+
+// Closes the handle on destruction. A moved-from FD has no implementation
+// object left, so there is nothing to close in that case.
+FD::~FD()
+{
+  if (!mp_impl) {
+    return;
+  }
+  close();
+}
+
+zsize_t FD::readAt(char* dest, zsize_t size, offset_t offset) const
+{
+  if (!mp_impl)
+    return zsize_t(-1);
+  EnterCriticalSection(&mp_impl->m_criticalSection);
+  LARGE_INTEGER off;
+  off.QuadPart = offset.v;
+  if (!SetFilePointerEx(mp_impl->m_handle, off, NULL, FILE_BEGIN)) {
+    goto err;
+  }
+
+  DWORD size_read;
+  if (!ReadFile(mp_impl->m_handle, dest, size.v, &size_read, NULL)) {
+    goto err;
+  }
+  if (size_read != size.v) {
+    goto err;
+  }
+  LeaveCriticalSection(&mp_impl->m_criticalSection);
+  return size;
+err:
+  LeaveCriticalSection(&mp_impl->m_criticalSection);
+  return zsize_t(-1);
+}
+
+// Moves the file pointer of the handle to the absolute position `offset`.
+// Returns false when there is no implementation object or the call fails.
+bool FD::seek(offset_t offset)
+{
+  if (mp_impl) {
+    LARGE_INTEGER target;
+    target.QuadPart = offset.v;
+    return SetFilePointerEx(mp_impl->m_handle, target, NULL, FILE_BEGIN);
+  }
+  return false;
+}
+
+// Size of the underlying file in bytes. A size of 0 is reported both when
+// there is no implementation object and when the size query fails.
+zsize_t FD::getSize() const
+{
+  if (!mp_impl) {
+    return zsize_t(0);
+  }
+  LARGE_INTEGER fileSize;
+  const BOOL ok = GetFileSizeEx(mp_impl->m_handle, &fileSize);
+  return zsize_t(ok ? fileSize.QuadPart : 0);
+}
+
+// Hands the file over to the caller as a CRT file descriptor.
+//
+// Returns the new descriptor, or -1 when there is no implementation object
+// or the handle could not be converted. Ownership is only given up when the
+// conversion succeeded; on failure the handle stays with this object (it
+// used to be forgotten, and therefore leaked, in that case too).
+int FD::release()
+{
+  if(!mp_impl)
+    return -1;
+  int ret = _open_osfhandle(reinterpret_cast<intptr_t>(mp_impl->m_handle), 0);
+  if (ret != -1) {
+    // The CRT descriptor owns the handle from here on.
+    mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  }
+  return ret;
+}
+
+// Closes the handle, if any. Returns false when there was nothing to close
+// or CloseHandle() reported an error.
+//
+// The stored handle is invalidated before it is closed, so that a second
+// close() -- for instance the one issued by the destructor after an
+// explicit close() -- does not call CloseHandle() on the same value again.
+bool FD::close()
+{
+  if (!mp_impl || mp_impl->m_handle == INVALID_HANDLE_VALUE) {
+    return false;
+  }
+  const HANDLE handle = mp_impl->m_handle;
+  mp_impl->m_handle = INVALID_HANDLE_VALUE;
+  return CloseHandle(handle) != 0;
+}
+
+std::unique_ptr<wchar_t[]> FS::toWideChar(path_t path)
+{
+  auto size = MultiByteToWideChar(CP_UTF8, 0,
+                path.c_str(), -1, nullptr, 0);
+  auto wdata = std::unique_ptr<wchar_t[]>(new wchar_t[size]);
+  auto ret = MultiByteToWideChar(CP_UTF8, 0,
+                path.c_str(), -1, wdata.get(), size);
+  if (0 == ret) {
+    std::ostringstream oss;
+    oss << "Cannot convert path to wchar : " << GetLastError();
+    throw std::runtime_error(oss.str());
+  }
+  return wdata;
+}
+
+FD FS::openFile(path_t filepath)
+{
+  auto wpath = toWideChar(filepath);
+  FD::fd_t handle;
+  handle = CreateFileW(wpath.get(),
+             GENERIC_READ,
+             FILE_SHARE_READ,
+             NULL,
+             OPEN_EXISTING,
+             FILE_ATTRIBUTE_READONLY|FILE_FLAG_RANDOM_ACCESS,
+             NULL);
+  if (handle == INVALID_HANDLE_VALUE) {
+    std::ostringstream oss;
+    oss << "Cannot open file : " << GetLastError();
+    throw std::runtime_error(oss.str());
+  }
+  return FD(handle);
+}
+
+// Creates the directory `path` with default security attributes and
+// reports whether CreateDirectoryW() succeeded.
+bool FS::makeDirectory(path_t path)
+{
+  const auto wideName = toWideChar(path);
+  return CreateDirectoryW(wideName.get(), NULL);
+}
+
+
+// Renames/moves `old_path` to `new_path`. The result of MoveFileW() is
+// discarded, so a failed move is not reported to the caller.
+void FS::rename(path_t old_path, path_t new_path)
+{
+  const auto from = toWideChar(old_path);
+  const auto to = toWideChar(new_path);
+  MoveFileW(from.get(), to.get());
+}
+
+// Concatenates `base` and `name` with a single backslash in between. No
+// normalisation is done: a separator is inserted unconditionally.
+std::string FS::join(path_t base, path_t name)
+{
+  std::string joined(base);
+  joined += "\\";
+  joined += name;
+  return joined;
+}
+
+// Removes the directory `path`; reports whether RemoveDirectoryW()
+// succeeded.
+bool FS::removeDir(path_t path)
+{
+  const auto wideName = toWideChar(path);
+  return RemoveDirectoryW(wideName.get());
+}
+
+// Deletes the file `path`; reports whether DeleteFileW() succeeded.
+bool FS::removeFile(path_t path)
+{
+  const auto wideName = toWideChar(path);
+  return DeleteFileW(wideName.get());
+}
+
+}; // windows namespace
+
+}; // zim namespace
+
diff --git a/src/fs_windows.h b/src/fs_windows.h

new file mode 100644 (file)

index 0000000..60d1062
--- /dev/null
+++ b/src/fs_windows.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (C) 2018 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_FS_WINDOWS_H_
+#define ZIM_FS_WINDOWS_H_
+
+#include "zim_types.h"
+
+#include <stdexcept>
+#include <memory>
+
+typedef void* HANDLE;
+
+namespace zim {
+
+namespace windows {
+
+using path_t = const std::string&;
+
+struct ImplFD;
+
+// Move-only wrapper around a Win32 file handle. The handle and the lock
+// that protects positioned reads live in ImplFD (defined in
+// fs_windows.cpp), so that <windows.h> does not have to be included here.
+class FD {
+  public:
+    typedef HANDLE fd_t;
+  private:
+    // Null after this object has been moved from; every member function
+    // checks for that.
+    std::unique_ptr<ImplFD> mp_impl;
+
+  public:
+    // Holds no file (invalid handle).
+    FD();
+    // Wraps an already opened Win32 handle.
+    FD(fd_t handle);
+    // Wraps a CRT file descriptor through its underlying OS handle.
+    FD(int fd);
+    FD(const FD& o) = delete;
+    FD(FD&& o);
+    FD& operator=(FD&& o);
+    FD& operator=(const FD& o) = delete;
+    // Closes the handle if there is still an implementation object.
+    ~FD();
+    // Reads exactly `size` bytes at absolute position `offset` into
+    // `dest`; returns `size` on success and zsize_t(-1) on failure.
+    zsize_t readAt(char* dest, zsize_t size, offset_t offset) const;
+    // File size in bytes; 0 when it cannot be determined.
+    zsize_t getSize() const;
+    // Converts the handle into a CRT file descriptor and returns it
+    // (-1 when there is no implementation object).
+    int     release();
+    // Moves the file pointer to the absolute position `offset`.
+    bool    seek(offset_t offset);
+    // Closes the handle; false when there was nothing to close.
+    bool    close();
+};
+
+// Static filesystem helpers of the Windows backend. Paths are taken as
+// UTF-8 std::string and converted with toWideChar() before they are
+// passed to the wide-character Win32 API.
+struct FS {
+    // File handle type that goes with this backend.
+    using FD = zim::windows::FD;
+    // Joins `base` and `name` with a backslash.
+    static std::string join(path_t base, path_t name);
+    // UTF-8 -> wide-character conversion; throws std::runtime_error on
+    // failure.
+    static std::unique_ptr<wchar_t[]> toWideChar(path_t path);
+    // Opens an existing file read-only; throws std::runtime_error on
+    // failure.
+    static FD   openFile(path_t filepath);
+    static bool makeDirectory(path_t path);
+    // Returns void: a failed rename is not reported.
+    static void rename(path_t old_path, path_t new_path);
+    // NOTE(review): no definition of FS::remove appears in fs_windows.cpp
+    // at this revision -- confirm it is defined elsewhere or never used.
+    static bool remove(path_t path);
+    static bool removeDir(path_t path);
+    static bool removeFile(path_t path);
+};
+
+}; // windows namespace
+
+}; // zim namespace
+
+#endif //ZIM_FS_WINDOWS_H_
diff --git a/src/levenshtein.cpp b/src/levenshtein.cpp

new file mode 100644 (file)

index 0000000..a520c01
--- /dev/null
+++ b/src/levenshtein.cpp
@@ -0,0 +1,31 @@
+
+#include "levenshtein.h"
+#include <algorithm>
+#include <numeric>
+#include <vector>
+
+int levenshtein_distance(const std::string &s1, const std::string &s2)
+{
+  int s1len = s1.size();
+  int s2len = s2.size();
+       
+  auto column_start = (decltype(s1len))1;
+
+  auto column = new decltype(s1len)[s1len + 1];
+  std::iota(column + column_start - 1, column + s1len + 1, column_start - 1);
+       
+  for (auto x = column_start; x <= s2len; x++) {
+    column[0] = x;
+    auto last_diagonal = x - column_start;
+    for (auto y = column_start; y <= s1len; y++) {
+      auto old_diagonal = column[y];
+      auto v1 = column[y] + 1;
+      auto v2 = column[y - 1] + 1;
+      auto v3 = last_diagonal + (s1[y - 1] == s2[x - 1]? 0 : 1);
+      column[y] = v1<v2 ? (v1<v3 ? v1 : v3) : ( v2<v3 ? v2 : v3);
+      last_diagonal = old_diagonal;
+    }
+  }
+  auto result = column[s1len];
+  delete[] column;
+  return result;
+}
diff --git a/src/levenshtein.h b/src/levenshtein.h

new file mode 100644 (file)

index 0000000..d634693
--- /dev/null
+++ b/src/levenshtein.h
@@ -0,0 +1,9 @@
+
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+#include <string>
+
+// Returns the Levenshtein (edit) distance between s1 and s2, i.e. the
+// minimal number of single-character insertions, deletions and
+// substitutions needed to turn one string into the other.
+int levenshtein_distance(const std::string &s1, const std::string &s2);
+
+#endif // LEVENSHTEIN_H
diff --git a/src/log.h b/src/log.h

new file mode 100644 (file)

index 0000000..5fbd81a
--- /dev/null
+++ b/src/log.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "config.h"
+
+#ifdef WITH_CXXTOOLS
+
+/* The logging macros come from cxxtools when it is available. */
+#include <cxxtools/log.h>
+
+#else
+
+/* Without cxxtools every logging macro expands to nothing: the argument
+   is dropped by the preprocessor and therefore never evaluated. */
+#define log_define(e)
+#define log_fatal(e)
+#define log_error(e)
+#define log_warn(e)
+#define log_info(e)
+#define log_debug(e)
+#define log_trace(e)
+#define log_init()
+
+#endif
diff --git a/src/md5.c b/src/md5.c

new file mode 100644 (file)

index 0000000..bae002e
--- /dev/null
+++ b/src/md5.c
@@ -0,0 +1,340 @@
+/* MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ */
+
+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+rights reserved.
+
+License to copy and use this software is granted provided that it
+is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+Algorithm" in all material mentioning or referencing this software
+or this function.
+
+License is also granted to make and use derivative works provided
+that such works are identified as "derived from the RSA Data
+Security, Inc. MD5 Message-Digest Algorithm" in all material
+mentioning or referencing the derived work.
+
+RSA Data Security, Inc. makes no representations concerning either
+the merchantability of this software or the suitability of this
+software for any particular purpose. It is provided "as is"
+without express or implied warranty of any kind.
+
+These notices must be retained in any copies of any part of this
+documentation and/or software.
+ */
+
+#include "md5.h"
+#include <string.h>
+
+#define MD5_CTX struct zim_MD5_CTX
+
+/* Constants for MD5Transform routine.
+ */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+
+static void MD5Transform PROTO_LIST ((UINT4 [4], const unsigned char [64]));
+static void Encode PROTO_LIST
+  ((unsigned char *, UINT4 *, unsigned int));
+static void Decode PROTO_LIST
+  ((UINT4 *, const unsigned char *, unsigned int));
+/*
+static void MD5_memcpy PROTO_LIST ((POINTER, POINTER, unsigned int));
+static void MD5_memset PROTO_LIST ((POINTER, int, unsigned int));
+*/
+#define MD5_memcpy memcpy
+#define MD5_memset memset
+
+/* Padding fed to zim_MD5Update() by zim_MD5Final(): one 0x80 byte followed
+   by zero bytes (elements without an initializer are zero-initialized).
+   The table is only ever read, hence const. */
+static const unsigned char PADDING[64] = { 0x80 };
+
+/* F, G, H and I are basic MD5 functions.
+   Every use of a macro argument is parenthesised, including the operand
+   of ~, so that an expression passed as argument is complemented as a
+   whole.
+ */
+#define F(x, y, z) (((x) & (y)) | ((~(x)) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~(z))))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~(z))))
+
+/* ROTATE_LEFT rotates x left n bits.
+ */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/* FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+  }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+  }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+  }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+  }
+
+/* Starts a new MD5 computation: clears the bit counter of the context and
+   loads its four state words with the initialization constants.
+ */
+void zim_MD5Init (MD5_CTX* context)
+{
+  static const UINT4 initial_state[4] = {
+    0x67452301, 0xefcdab89, 0x98badcfe, 0x10325476
+  };
+  unsigned int k;
+
+  context->count[0] = 0;
+  context->count[1] = 0;
+  for (k = 0; k < 4; k++)
+    context->state[k] = initial_state[k];
+}
+
+/* Feeds inputLen bytes from input into the running digest.
+   Input is collected in context->buffer; every time 64 bytes are
+   available they are run through MD5Transform. Whatever does not fill a
+   complete block stays in the buffer for the next call.
+ */
+void zim_MD5Update (
+MD5_CTX *context,
+const unsigned char *input,                          /* input block */
+unsigned int inputLen)                     /* length of input block */
+{
+  const UINT4 addedBits = (UINT4)inputLen << 3;
+  /* Bytes already waiting in the buffer (byte count mod 64) */
+  unsigned int buffered = (unsigned int)((context->count[0] >> 3) & 0x3F);
+  /* Bytes needed to complete the buffered block */
+  const unsigned int room = 64 - buffered;
+  /* Bytes of input that have been dealt with so far */
+  unsigned int consumed = 0;
+
+  /* Advance the bit counter, carrying from the low into the high word */
+  context->count[0] += addedBits;
+  if (context->count[0] < addedBits)
+    context->count[1]++;
+  context->count[1] += ((UINT4)inputLen >> 29);
+
+  if (inputLen >= room) {
+    /* Complete and process the buffered block first ... */
+    MD5_memcpy (&context->buffer[buffered], input, room);
+    MD5Transform (context->state, context->buffer);
+
+    /* ... then every further full block straight from the input. */
+    consumed = room;
+    while (consumed + 63 < inputLen) {
+      MD5Transform (context->state, &input[consumed]);
+      consumed += 64;
+    }
+
+    buffered = 0;
+  }
+
+  /* Keep the rest for the next call */
+  MD5_memcpy
+    (&context->buffer[buffered], &input[consumed], inputLen - consumed);
+}
+
+/* MD5 finalization. Ends an MD5 message-digest operation, writing the
+   message digest and zeroizing the context.
+ */
+void zim_MD5Final (
+unsigned char digest[16],                         /* message digest */
+MD5_CTX *context)                                       /* context */
+{
+  unsigned char length_bytes[8];
+  unsigned int used, pad_count;
+
+  /* Serialize the bit count now: the padding below goes through
+     zim_MD5Update and changes the counter. */
+  Encode (length_bytes, context->count, 8);
+
+  /* Pad so that the byte count becomes 56 mod 64 */
+  used = (unsigned int)((context->count[0] >> 3) & 0x3f);
+  if (used < 56)
+    pad_count = 56 - used;
+  else
+    pad_count = 120 - used;
+  zim_MD5Update (context, PADDING, pad_count);
+
+  /* The saved length fills the last 8 bytes of the final block */
+  zim_MD5Update (context, length_bytes, 8);
+
+  /* The digest is the state, serialized low byte first */
+  Encode (digest, context->state, 16);
+
+  /* Leave nothing of the computation behind in the context */
+  MD5_memset (context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block.
+   The 64 bytes of block are decoded into sixteen 32-bit words, which are
+   mixed into working copies of the four state words in four rounds of
+   sixteen steps each (FF, GG, HH and II). The working copies are then
+   added back onto state.
+ */
+static void MD5Transform (
+UINT4 state[4],
+const unsigned char block[64])
+{
+  /* a..d: working copies of the state, x: the decoded message words */
+  UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+  Decode (x, block, 64);
+
+  /* Round 1 */
+  FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+  FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+  FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+  FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+  FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+  FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+  FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+  FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+  FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+  FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+  FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+  FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+  FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+  FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+  FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+  FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+  /* Round 2 */
+  GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+  GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+  GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+  GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+  GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+  GG (d, a, b, c, x[10], S22,  0x2441453); /* 22 */
+  GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+  GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+  GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+  GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+  GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+  GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+  GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+  GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+  GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+  GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+  /* Round 3 */
+  HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+  HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+  HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+  HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+  HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+  HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+  HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+  HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+  HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+  HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+  HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+  HH (b, c, d, a, x[ 6], S34,  0x4881d05); /* 44 */
+  HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+  HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+  HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+  HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+  /* Round 4 */
+  II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+  II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+  II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+  II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+  II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+  II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+  II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+  II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+  II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+  II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+  II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+  II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+  II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+  II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+  II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+  II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+  /* Fold the result of the four rounds back into the caller's state */
+  state[0] += a;
+  state[1] += b;
+  state[2] += c;
+  state[3] += d;
+
+  /* Zeroize sensitive information: the decoded message words are wiped
+     before the function returns.
+*/
+  MD5_memset ((POINTER)x, 0, sizeof (x));
+}
+
+/* Serializes the words of input into output, least significant byte
+   first, until len bytes have been produced. len is expected to be a
+   multiple of 4.
+ */
+static void Encode (
+unsigned char *output,
+UINT4 *input,
+unsigned int len)
+{
+  unsigned int word, shift;
+  unsigned int out = 0;
+
+  for (word = 0; out < len; word++) {
+    /* Emit the four bytes of this word from low to high */
+    for (shift = 0; shift < 32; shift += 8)
+      output[out++] = (unsigned char)((input[word] >> shift) & 0xff);
+  }
+}
+
+/* Assembles words from len bytes of input, taking the first byte of each
+   group of four as the least significant one. len is expected to be a
+   multiple of 4.
+ */
+static void Decode (
+UINT4 *output,
+const unsigned char *input,
+unsigned int len)
+{
+  unsigned int word, base;
+
+  for (word = 0, base = 0; base < len; word++, base += 4) {
+    UINT4 value = 0;
+    unsigned int k;
+
+    /* Byte k of the group lands in bits 8k .. 8k+7 */
+    for (k = 0; k < 4; k++)
+      value |= ((UINT4)input[base + k]) << (8 * k);
+    output[word] = value;
+  }
+}
+
+#if 0
+/* Disabled: this whole section is compiled out. MD5_memcpy and MD5_memset
+   are #defined to memcpy and memset further up in this file, so these
+   byte-wise fallbacks are kept for reference only.
+ */
+
+/* Note: Replace "for loop" with standard memcpy if possible.
+ */
+
+static void MD5_memcpy (
+POINTER output,
+POINTER input,
+unsigned int len)
+{
+  unsigned int i;
+
+  for (i = 0; i < len; i++)
+ output[i] = input[i];
+}
+
+/* Note: Replace "for loop" with standard memset if possible.
+ */
+static void MD5_memset (
+POINTER output,
+int value,
+unsigned int len)
+{
+  unsigned int i;
+
+  for (i = 0; i < len; i++)
+ ((char *)output)[i] = (char)value;
+}
+#endif
diff --git a/src/md5.h b/src/md5.h

new file mode 100644 (file)

index 0000000..29bdc39
--- /dev/null
+++ b/src/md5.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2003 Tommi Maekitalo
+ * 
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ * 
+ * As a special exception, you may use this file as part of a free
+ * software library without restriction. Specifically, if other files
+ * instantiate templates or use macros or inline functions from this
+ * file, or you compile this file and link it with other files to
+ * produce an executable, this file does not by itself cause the
+ * resulting executable to be covered by the GNU General Public
+ * License. This exception does not however invalidate any other
+ * reasons why the executable file might be covered by the GNU Library
+ * General Public License.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ * 
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+rights reserved.
+
+License to copy and use this software is granted provided that it
+is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+Algorithm" in all material mentioning or referencing this software
+or this function.
+
+License is also granted to make and use derivative works provided
+that such works are identified as "derived from the RSA Data
+Security, Inc. MD5 Message-Digest Algorithm" in all material
+mentioning or referencing the derived work.
+
+RSA Data Security, Inc. makes no representations concerning either
+the merchantability of this software or the suitability of this
+software for any particular purpose. It is provided "as is"
+without express or implied warranty of any kind.
+
+These notices must be retained in any copies of any part of this
+documentation and/or software.
+ */
+
+/* RSAREF types and constants
+ */
+
+/* PROTOTYPES should be set to one if and only if the compiler supports
+  function argument prototyping.
+The following makes PROTOTYPES default to 1 if it has not already
+  been defined with C compiler flags.
+ */
+
+#ifndef ZIM_MD5_H
+#define ZIM_MD5_H
+
+#ifndef PROTOTYPES
+#define PROTOTYPES 1
+#endif
+
+/* POINTER defines a generic pointer type */
+typedef unsigned char *POINTER;
+
+/* UINT2 defines a two byte word */
+typedef unsigned short int UINT2;
+
+/* UINT4 defines a four byte word */
+typedef unsigned int UINT4;
+
+/* PROTO_LIST is defined depending on how PROTOTYPES is defined above.
+   If using PROTOTYPES, then PROTO_LIST returns the list, otherwise it
+  returns an empty list.
+ */
+
+#if PROTOTYPES
+#define PROTO_LIST(list) list
+#else
+#define PROTO_LIST(list) ()
+#endif
+
+/* MD5 context.
+   Set up by zim_MD5Init(), advanced by zim_MD5Update() and turned into a
+   digest (and then zeroed) by zim_MD5Final(). */
+struct zim_MD5_CTX {
+  UINT4 state[4];                                   /* state (ABCD) */
+  UINT4 count[2];        /* number of bits, modulo 2^64 (lsb first) */
+  unsigned char buffer[64];                         /* input buffer */
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void zim_MD5Init PROTO_LIST ((struct zim_MD5_CTX *));
+void zim_MD5Update PROTO_LIST
+  ((struct zim_MD5_CTX *, const unsigned char *, unsigned int));
+void zim_MD5Final PROTO_LIST ((unsigned char [16], struct zim_MD5_CTX *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* ZIM_MD5_H */
diff --git a/src/meson.build b/src/meson.build

new file mode 100644 (file)

index 0000000..4814914
--- /dev/null
+++ b/src/meson.build
@@ -0,0 +1,74 @@
+
+# Generate config.h from the config.h.in template, filled in with the
+# values of the `conf` configuration object (defined outside this file).
+configure_file(output : 'config.h',
+               configuration : conf,
+               input : 'config.h.in')
+
+# Include directory object for this source directory.
+src_directory = include_directories('.')
+
+common_sources = [
+#    'config.h',
+    'article.cpp',
+    'cluster.cpp',
+    'dirent.cpp',
+    'envvalue.cpp',
+    'file.cpp',
+    'fileheader.cpp',
+    'fileimpl.cpp',
+    'file_compound.cpp',
+    'file_reader.cpp',
+    'blob.cpp',
+    'buffer.cpp',
+    'md5.c',
+    'search.cpp',
+    'search_iterator.cpp',
+    'template.cpp',
+    'uuid.cpp',
+    'levenshtein.cpp',
+    'tools.cpp',
+    'compression.cpp',
+    'writer/creator.cpp',
+    'writer/article.cpp',
+    'writer/cluster.cpp',
+    'writer/dirent.cpp',
+    'writer/workers.cpp',
+    'writer/xapianIndexer.cpp'
+]
+
+# Pick the filesystem backend that matches the host platform: the Win32
+# implementation on Windows, the Unix one everywhere else.
+if host_machine.system() == 'windows'
+    common_sources += 'fs_windows.cpp'
+else
+    common_sources += 'fs_unix.cpp'
+endif
+
+xapian_sources = [
+    'xapian/htmlparse.cc',
+    'xapian/myhtmlparse.cc'
+]
+
+# Threads and lzma are always linked in; zlib, zstd and xapian are added
+# only when the corresponding dependency was found.
+sources = common_sources
+deps = [thread_dep, lzma_dep]
+
+if zlib_dep.found()
+    deps += [zlib_dep]
+endif
+
+if zstd_dep.found()
+    deps += [zstd_dep]
+endif
+
+# Xapian support also pulls in the bundled HTML parser sources, the
+# generated library resources and ICU.
+if xapian_dep.found()
+    sources += xapian_sources
+    sources += lib_resources
+    deps += [xapian_dep, icu_dep]
+endif
+
+# The library itself, versioned like the project and installed.
+libzim = library('zim',
+                 sources,
+                 include_directories : inc,
+                 dependencies : deps,
+                 link_args : extra_link_args,
+                 cpp_args : extra_cpp_args,
+                 version: meson.project_version(),
+                 install : true)
+# Dependency object for other targets of this build that link against the
+# library.
+libzim_dep = declare_dependency(link_with: libzim,
+                                include_directories: include_directory)
diff --git a/src/search.cpp b/src/search.cpp

new file mode 100644 (file)

index 0000000..a1296a3
--- /dev/null
+++ b/src/search.cpp
@@ -0,0 +1,443 @@
+/*
+ * Copyright (C) 2007 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/search.h>
+#include <zim/file.h>
+#include "search_internal.h"
+#include "levenshtein.h"
+#include "fs.h"
+
+#include <sstream>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#if !defined(_WIN32)
+# include <unistd.h>
+#else
+# include <io.h>
+#endif
+#include <errno.h>
+
+#if defined(ENABLE_XAPIAN)
+#include "xapian.h"
+#include <unicode/locid.h>
+#endif
+
+#define MAX_MATCHES_TO_SORT 10000
+
+namespace zim
+{
+
+#if defined(ENABLE_XAPIAN)
+namespace {
+/* Split string in a token array */
+std::vector<std::string> split(const std::string & str,
+                                const std::string & delims=" *-")
+{
+  std::string::size_type lastPos = str.find_first_not_of(delims, 0);
+  std::string::size_type pos = str.find_first_of(delims, lastPos);
+  std::vector<std::string> tokens;
+
+  while (std::string::npos != pos || std::string::npos != lastPos)
+    {
+      tokens.push_back(str.substr(lastPos, pos - lastPos));
+      lastPos = str.find_first_not_of(delims, pos);
+      pos     = str.find_first_of(delims, lastPos);
+    }
+
+  return tokens;
+}
+
+std::map<std::string, int> read_valuesmap(const std::string &s) {
+    std::map<std::string, int> result;
+    std::vector<std::string> elems = split(s, ";");
+    for(std::vector<std::string>::iterator elem = elems.begin();
+        elem != elems.end();
+        elem++)
+    {
+        std::vector<std::string> tmp_elems = split(*elem, ":");
+        result.insert( std::pair<std::string, int>(tmp_elems[0], atoi(tmp_elems[1].c_str())) );
+    }
+    return result;
+}
+
+
+void
+setup_queryParser(Xapian::QueryParser* queryparser,
+                  Xapian::Database& database,
+                  const std::string& language,
+                  const std::string& stopwords,
+                  bool newSuggestionFormat) {
+    queryparser->set_default_op(Xapian::Query::op::OP_AND);
+    queryparser->set_database(database);
+    if ( ! language.empty() )
+    {
+        /* Build ICU Local object to retrieve ISO-639 language code (from
+           ISO-639-3) */
+        icu::Locale languageLocale(language.c_str());
+
+        /* Configuring language base steemming */
+        try {
+            Xapian::Stem stemmer = Xapian::Stem(languageLocale.getLanguage());
+            queryparser->set_stemmer(stemmer);
+            queryparser->set_stemming_strategy(
+              newSuggestionFormat ? Xapian::QueryParser::STEM_SOME : Xapian::QueryParser::STEM_ALL);
+        } catch (...) {
+            std::cout << "No steemming for language '" << languageLocale.getLanguage() << "'" << std::endl;
+        }
+    }
+
+    if ( ! stopwords.empty() )
+    {
+        std::string stopWord;
+        std::istringstream file(stopwords);
+        Xapian::SimpleStopper* stopper = new Xapian::SimpleStopper();
+        while (std::getline(file, stopWord, '\n')) {
+            stopper->add(stopWord);
+        }
+        stopper->release();
+        queryparser->set_stopper(stopper);
+    }
+}
+
+class LevenshteinDistanceMaker : public Xapian::KeyMaker {
+  public:
+    LevenshteinDistanceMaker(const std::string& query, size_t value_index):
+        query(query),
+        value_index(value_index) {}
+    ~LevenshteinDistanceMaker() = default;
+
+    virtual std::string operator() (const Xapian::Document &doc) const {
+       auto document_value = doc.get_value(value_index);
+       return Xapian::sortable_serialise(
+                  levenshtein_distance(document_value, query));
+    }
+  private:
+    std::string query;
+    size_t value_index;
+};
+
+}
+#endif
+
+/* Build a search over several zim files.
+ * Only members are initialised here: no query, no range, no geo filter and
+ * no database opened yet (that is done by begin()). */
+Search::Search(const std::vector<const File*> zimfiles) :
+    internal(new InternalData),
+    zimfiles(zimfiles),
+    prefixes(""), query(""),
+    latitude(0), longitude(0), distance(0),
+    range_start(0), range_end(0),
+    suggestion_mode(false),
+    geo_query(false),
+    search_started(false),
+    has_database(false),
+    verbose(false),
+    estimated_matches_number(0)
+{}
+
+/* Build a search over a single zim file; same defaults as the multi-file
+ * constructor, with `zimfile` as the only entry of `zimfiles`. */
+Search::Search(const File* zimfile) :
+    internal(new InternalData),
+    prefixes(""), query(""),
+    latitude(0), longitude(0), distance(0),
+    range_start(0), range_end(0),
+    suggestion_mode(false),
+    geo_query(false),
+    search_started(false),
+    has_database(false),
+    verbose(false),
+    estimated_matches_number(0)
+{
+    zimfiles.push_back(zimfile);
+}
+
+/* Copy the search parameters of `it`, not its state: the copy gets a fresh
+ * InternalData and is marked as not started, without database and without
+ * estimated matches. */
+Search::Search(const Search& it) :
+     internal(new InternalData),
+     zimfiles(it.zimfiles),
+     prefixes(it.prefixes),
+     query(it.query),
+     latitude(it.latitude), longitude(it.longitude), distance(it.distance),
+     range_start(it.range_start), range_end(it.range_end),
+     suggestion_mode(it.suggestion_mode),
+     geo_query(it.geo_query),
+     search_started(false),
+     has_database(false),
+     verbose(it.verbose),
+     estimated_matches_number(0)
+{ }
+
+Search& Search::operator=(const Search& it)
+{
+     if ( internal ) internal.reset();
+     zimfiles = it.zimfiles;
+     prefixes = it.prefixes;
+     query = it.query;
+     latitude = it.latitude;
+     longitude = it.longitude;
+     distance = it.distance;
+     range_start = it.range_start;
+     range_end = it.range_end;
+     suggestion_mode = it.suggestion_mode;
+     geo_query = it.geo_query;
+     search_started = false;
+     has_database = false;
+     verbose = it.verbose;
+     estimated_matches_number = 0;
+     return *this;
+}
+
+Search::Search(Search&& it) = default;
+Search& Search::operator=(Search&& it) = default;
+Search::~Search() = default;
+
+/* Enable or disable the diagnostic messages guarded by the `verbose` flag. */
+void Search::set_verbose(bool verbose) {
+    this->verbose = verbose;
+}
+
+/* Append `zimfile` to the files to search in. Returns *this for chaining. */
+Search& Search::add_zimfile(const File* zimfile) {
+    zimfiles.push_back(zimfile);
+    return *this;
+}
+
+/* Set the textual query. Returns *this for chaining. */
+Search& Search::set_query(const std::string& query) {
+    this->query = query;
+    return *this;
+}
+
+/* Store a centre (latitude, longitude) and a distance, and flag the search
+ * as a geo query. Returns *this for chaining. */
+Search& Search::set_georange(float latitude, float longitude, float distance) {
+    this->latitude = latitude;
+    this->longitude = longitude;
+    this->distance = distance;
+    geo_query = true;
+    return *this;
+}
+
+/* Store the start and end offsets of the result window.
+ * Returns *this for chaining. */
+Search& Search::set_range(int start, int end) {
+    this->range_start = start;
+    this->range_end = end; 
+    return *this;
+}
+
+/* Switch suggestion mode on or off. Returns *this for chaining. */
+Search& Search::set_suggestion_mode(const bool suggestion_mode) {
+    this->suggestion_mode = suggestion_mode;
+    return *this;
+}
+
+#define WITH_LEV 1
+
+Search::iterator Search::begin() const {
+#if defined(ENABLE_XAPIAN)
+    if ( this->search_started ) {
+        return new search_iterator::InternalData(this, internal->results.begin());
+    }
+
+    std::vector<const File*>::const_iterator it;
+    bool first = true;
+    bool hasNewSuggestionFormat = false;
+    std::string language;
+    std::string stopwords;
+    for(it=zimfiles.begin(); it!=zimfiles.end(); it++)
+    {
+        const File* zimfile = *it;
+        if (zimfile->is_multiPart()) {
+            continue;
+        }
+        zim::Article xapianArticle;
+        if (suggestion_mode) {
+          xapianArticle = zimfile->getArticle('X', "title/xapian");
+          if (xapianArticle.good()) {
+            hasNewSuggestionFormat = true;
+          }
+        }
+        if (!xapianArticle.good()) {
+          xapianArticle = zimfile->getArticle('X', "fulltext/xapian");
+        }
+        if (!xapianArticle.good()) {
+          xapianArticle = zimfile->getArticle('Z', "/fulltextIndex/xapian");
+        }
+        if (!xapianArticle.good()) {
+            continue;
+        }
+        auto dbOffset = xapianArticle.getOffset();
+        if (dbOffset == 0) {
+            continue;
+        }
+        DEFAULTFS::FD databasefd;
+        try {
+            databasefd = DEFAULTFS::openFile(zimfile->getFilename());
+        } catch (...) {
+            std::cerr << "Impossible to open " << zimfile->getFilename() << std::endl;
+            std::cerr << strerror(errno) << std::endl;
+            continue;
+        }
+        if (!databasefd.seek(offset_t(dbOffset))) {
+            std::cerr << "Something went wrong seeking databasedb "
+                      << zimfile->getFilename() << std::endl;
+            std::cerr << "dbOffest = " << dbOffset << std::endl;
+            continue;
+        }
+        Xapian::Database database;
+        try {
+            database = Xapian::Database(databasefd.release());
+        } catch( Xapian::DatabaseError& e) {
+            std::cerr << "Something went wrong opening xapian database for zimfile "
+                      << zimfile->getFilename() << std::endl;
+            std::cerr << "dbOffest = " << dbOffset << std::endl;
+            std::cerr << "error = " << e.get_msg() << std::endl;
+            continue;
+        }
+
+        if ( first ) {
+            this->valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+            language = database.get_metadata("language");
+            if (language.empty() ) {
+              // Database created before 2017/03 has no language metadata.
+              // However, term were stemmed anyway and we need to stem our
+              // search query the same the database was created.
+              // So we need a language, let's use the one of the zim.
+              // If zimfile has no language metadata, we can't do lot more here :/
+              auto article = zimfile->getArticle('M', "Language");
+              if ( article.good() ) {
+                language = article.getData();
+              }
+            }
+            stopwords = database.get_metadata("stopwords");
+            this->prefixes = database.get_metadata("prefixes");
+        } else {
+            std::map<std::string, int> valuesmap = read_valuesmap(database.get_metadata("valuesmap"));
+            if (this->valuesmap != valuesmap ) {
+                // [TODO] Ignore the database, raise a error ?
+            }
+        }
+        internal->xapian_databases.push_back(database);
+        internal->database.add_database(database);
+        has_database = true;
+    }
+
+    if ( ! has_database ) {
+        if (verbose) {
+          std::cout << "No database, no result" << std::endl;
+        }
+        estimated_matches_number = 0;
+        return nullptr;
+    }
+
+    Xapian::QueryParser* queryParser = new Xapian::QueryParser();
+    if (verbose) {
+      std::cout << "Setup queryparser using language " << language << std::endl;
+    }
+    setup_queryParser(queryParser, internal->database, language, stopwords, hasNewSuggestionFormat);
+
+    std::string prefix = "";
+    unsigned flags = Xapian::QueryParser::FLAG_DEFAULT;
+    if (suggestion_mode) {
+      if (verbose) {
+        std::cout << "Mark query as 'partial'" << std::endl;
+      }
+      flags |= Xapian::QueryParser::FLAG_PARTIAL;
+      if ( !hasNewSuggestionFormat
+        && this->prefixes.find("S") != std::string::npos ) {
+        if (verbose) {
+          std::cout << "Searching in title namespace" << std::endl;
+        }
+        prefix = "S";
+      }
+    }
+    Xapian::Query query;
+    try {
+      query = queryParser->parse_query(this->query, flags, prefix);
+    } catch (Xapian::QueryParserError& e) {
+      estimated_matches_number = 0;
+      return nullptr;
+    }
+    if (verbose) {
+        std::cout << "Parsed query '" << this->query << "' to " << query.get_description() << std::endl;
+    }
+    delete queryParser;
+
+    Xapian::Enquire enquire(internal->database);
+#if WITH_LEV
+    std::unique_ptr<Xapian::KeyMaker> keyMaker(nullptr);
+#endif
+
+    if (geo_query && valuesmap.find("geo.position") != valuesmap.end()) {
+        Xapian::GreatCircleMetric metric;
+        Xapian::LatLongCoord centre(latitude, longitude);
+        Xapian::LatLongDistancePostingSource ps(valuesmap["geo.position"], centre, metric, distance);
+        if ( this->query.empty()) {
+          query = Xapian::Query(&ps);
+        } else {
+          query = Xapian::Query(Xapian::Query::OP_FILTER, query, Xapian::Query(&ps));
+        }
+    }
+
+    enquire.set_query(query);
+
+#if WITH_LEV
+    if (suggestion_mode && !hasNewSuggestionFormat) {
+      size_t value_index = 0;
+      bool has_custom_distance_maker = true;
+      if ( !valuesmap.empty() ) {
+        if ( valuesmap.find("title") != valuesmap.end() ) {
+          value_index = valuesmap["title"];
+        } else {
+          // This should not happen as valuesmap has a title entry, but let's
+          // be tolerent.
+          has_custom_distance_maker = false;
+        }
+      }
+      auto temp_results = enquire.get_mset(0,0);
+      if ( has_custom_distance_maker
+        && temp_results.get_matches_estimated() <= MAX_MATCHES_TO_SORT ) {
+        keyMaker.reset(new LevenshteinDistanceMaker(this->query, value_index));
+        enquire.set_sort_by_key(keyMaker.get(), false);
+      }
+    }
+#endif
+
+    if (suggestion_mode && valuesmap.find("title") != valuesmap.end()) {
+      enquire.set_collapse_key(valuesmap["title"]);
+    }
+
+    internal->results = enquire.get_mset(this->range_start, this->range_end-this->range_start);
+    search_started = true;
+    estimated_matches_number = internal->results.get_matches_estimated();
+    return new search_iterator::InternalData(this, internal->results.begin());
+#else
+    estimated_matches_number = 0;
+    return nullptr;
+#endif
+}
+
+/* Past-the-end iterator of the stored results; a null iterator when no
+ * database is available or when built without xapian. */
+Search::iterator Search::end() const {
+#if defined(ENABLE_XAPIAN)
+    if ( has_database ) {
+        return new search_iterator::InternalData(this, internal->results.end());
+    }
+#endif
+    return nullptr;
+}
+
+/* Estimated number of matches for the search; the search is run first if it
+ * has not been started yet. */
+int Search::get_matches_estimated() const {
+    // Ensure that the search has begun
+    begin();
+    return estimated_matches_number;
+}
+
+} //namespace zim
diff --git a/src/search_internal.h b/src/search_internal.h

new file mode 100644 (file)

index 0000000..8781463
--- /dev/null
+++ b/src/search_internal.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_SEARCH_INTERNAL_H
+#define ZIM_SEARCH_INTERNAL_H
+
+#include "config.h"
+
+#if defined(ENABLE_XAPIAN)
+#include <xapian.h>
+#endif
+
+namespace zim {
+
+/* Xapian state of a Search, kept out of the public header.
+ * Empty when the library is built without xapian. */
+struct Search::InternalData {
+#if defined(ENABLE_XAPIAN)
+    std::vector<Xapian::Database> xapian_databases;
+    Xapian::Database database;
+    Xapian::MSet results;
+#endif
+};
+
+/* State of a search_iterator: a position in the results of a Search plus
+ * lazily fetched (cached) document and article for that position. */
+struct search_iterator::InternalData {
+#if defined(ENABLE_XAPIAN)
+    const Search* search;
+    Xapian::MSetIterator iterator;
+    Xapian::Document _document;
+    // True once _document has been looked up for the current position.
+    bool document_fetched;
+#endif
+    Article _article;
+    // True once _article has been looked up for the current position.
+    bool article_fetched;
+
+
+#if defined(ENABLE_XAPIAN)
+    InternalData(const Search* search, Xapian::MSetIterator iterator) : 
+        search(search),
+        iterator(iterator),
+        document_fetched(false),
+        article_fetched(false)
+    {};
+    
+    /* Document at the current position, fetched on first use. At the end of
+     * the results a default constructed document is returned. */
+    Xapian::Document get_document() {
+        if ( !document_fetched ) {
+            if (iterator != search->internal->results.end()) {
+                _document = iterator.get_document();
+            }
+            document_fetched = true;
+        }
+        return _document;
+    }
+#endif
+
+    /* Index, in search->zimfiles, of the file the current result comes from
+     * (always 0 without xapian). */
+    int get_databasenumber() {
+#if defined(ENABLE_XAPIAN)
+        // NOTE(review): the modulus is the number of zim files. If fewer
+        // databases than files were added to the search, this index is
+        // presumably wrong - to be confirmed against Search::begin().
+        Xapian::docid docid = *iterator;
+        return (docid - 1) % search->zimfiles.size();
+#endif
+        return 0;
+    }
+
+    /* Article of the current result, fetched on first use by looking up the
+     * document data as an url in the originating file. Without xapian the
+     * default constructed article is returned. */
+    Article& get_article() {
+#if defined(ENABLE_XAPIAN)
+        if ( !article_fetched ) {
+            int databasenumber = get_databasenumber();
+            const File* file = search->zimfiles[databasenumber];
+            if ( ! file )
+                _article = Article();
+            else
+                _article = file->getArticleByUrl(get_document().get_data());
+            article_fetched = true;
+        }
+#endif
+        return _article;
+    }
+};
+
+
+
+}; //namespace zim
+
+#endif //ZIM_SEARCH_INTERNAL_H
diff --git a/src/search_iterator.cpp b/src/search_iterator.cpp

new file mode 100644 (file)

index 0000000..c950305
--- /dev/null
+++ b/src/search_iterator.cpp
@@ -0,0 +1,239 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "xapian/myhtmlparse.h"
+#include <zim/search_iterator.h>
+#include <zim/search.h>
+#include <zim/file.h>
+#include "search_internal.h"
+
+namespace zim {
+
+
+search_iterator::~search_iterator() = default;
+search_iterator::search_iterator(search_iterator&& it) = default;
+search_iterator& search_iterator::operator=(search_iterator&& it) = default;
+
+/* Default iterator: no internal data attached. */
+search_iterator::search_iterator() : search_iterator(nullptr)
+{};
+
+/* Wrap `internal_data` (may be null); the pointer is stored in `internal`. */
+search_iterator::search_iterator(InternalData* internal_data)
+  : internal(internal_data)
+{}
+
+/* Copy constructor: duplicates the internal data of `it` when it has any. */
+search_iterator::search_iterator(const search_iterator& it)
+    : internal(it.internal ? new InternalData(*it.internal) : nullptr)
+{
+}
+
+/* Copy assignment: ends up with a copy of the internal data of `it`, or with
+ * none when `it` has none. */
+search_iterator & search_iterator::operator=(const search_iterator& it) {
+    if ( it.internal ) {
+        if ( internal ) {
+            // Reuse the existing storage.
+            *internal = *it.internal;
+        } else {
+            internal = std::unique_ptr<InternalData>(new InternalData(*it.internal));
+        }
+    } else {
+        internal.reset();
+    }
+
+    return *this;
+}
+
+bool search_iterator::operator==(const search_iterator& it) const {
+#if defined(ENABLE_XAPIAN)
+    if ( ! internal && ! it.internal)
+        return true;
+    if ( ! internal || ! it.internal)
+        return false;
+    return (internal->search == it.internal->search
+         && internal->iterator == it.internal->iterator);
+#else
+    // If there is no xapian, there is no search. There is only one iterator: end.
+    // So all iterators are equal.
+    return true;
+#endif
+}
+
+/* Negation of operator==. */
+bool search_iterator::operator!=(const search_iterator& it) const {
+    return ! (*this == it);
+}
+
+/* Move to the next result and drop the cached document and article.
+ * Does nothing on an iterator without internal data. */
+search_iterator& search_iterator::operator++() {
+#if defined(ENABLE_XAPIAN)
+    if ( internal ) {
+        ++(internal->iterator);
+        internal->document_fetched = false;
+        internal->article_fetched = false;
+    }
+#endif
+    return *this;
+}
+
+/* Post-increment: advance and return a copy of the previous state. */
+search_iterator search_iterator::operator++(int) {
+    search_iterator it = *this;
+    operator++();
+    return it;
+}
+
+/* Move to the previous result and drop the cached document and article.
+ * Does nothing on an iterator without internal data. */
+search_iterator& search_iterator::operator--() {
+#if defined(ENABLE_XAPIAN)
+    if ( internal ) {
+        --(internal->iterator);
+        internal->document_fetched = false;
+        internal->article_fetched = false;
+    }
+#endif
+    return *this;
+}
+
+/* Post-decrement: step back and return a copy of the previous state. */
+search_iterator search_iterator::operator--(int) {
+    search_iterator it = *this;
+    operator--();
+    return it;
+}
+
+/* Data stored in the current document (used as the article url); empty
+ * without internal data or without xapian. */
+std::string search_iterator::get_url() const {
+#if defined(ENABLE_XAPIAN)
+    if ( internal ) {
+        return internal->get_document().get_data();
+    }
+#endif
+    return "";
+}
+
+std::string search_iterator::get_title() const {
+#if defined(ENABLE_XAPIAN)
+    if ( ! internal ) {
+        return "";
+    }
+    if ( internal->search->valuesmap.empty() )
+    {
+        /* This is the old legacy version. Guess and try */
+        return internal->get_document().get_value(0);
+    }
+    else if ( internal->search->valuesmap.find("title") != internal->search->valuesmap.end() )
+    {
+        return internal->get_document().get_value(internal->search->valuesmap["title"]);
+    }
+#endif
+    return "";
+}
+
+/* Percentage score of the current result; 0 without internal data or
+ * without xapian. */
+int search_iterator::get_score() const {
+#if defined(ENABLE_XAPIAN)
+    if ( internal ) {
+        return internal->iterator.get_percent();
+    }
+#endif
+    return 0;
+}
+
+/* Snippet for the current result.
+ * A snippet stored in the database is preferred (value 1 for databases
+ * without valuesmap, the "snippet" value otherwise). When none is stored, one
+ * is generated from the text of the article. Returns an empty string without
+ * internal data, without usable article or without xapian. */
+std::string search_iterator::get_snippet() const {
+#if defined(ENABLE_XAPIAN)
+    if ( ! internal ) {
+        return "";
+    }
+    if ( internal->search->valuesmap.empty() )
+    {
+        /* This is the old legacy version. Guess and try */
+        std::string stored_snippet = internal->get_document().get_value(1);
+        if ( ! stored_snippet.empty() )
+            return stored_snippet;
+        /* Let's continue here, and see if we can generate one */
+    }
+    else if ( internal->search->valuesmap.find("snippet") != internal->search->valuesmap.end() )
+    {
+        return internal->get_document().get_value(internal->search->valuesmap["snippet"]);
+    }
+    /* No reader, no snippet */
+    Article& article = internal->get_article();
+    if ( ! article.good() )
+        return "";
+    /* Get the content of the article to generate a snippet.
+       We parse it and use the html dump, so the snippet contains no html
+       tag and the text can be cut nicely at any place. */
+    zim::MyHtmlParser htmlParser;
+    std::string content = article.getData();
+    try {
+        htmlParser.parse_html(content, "UTF-8", true);
+    } catch (...) {}
+    /* Parsing errors are ignored: whatever has been dumped so far is used. */
+    return internal->search->internal->results.snippet(htmlParser.dump, 500);
+#else
+    return "";
+#endif
+}
+
+int search_iterator::get_size() const {
+#if defined(ENABLE_XAPIAN)
+    if ( ! internal ) {
+        return -1;
+    }
+    if ( internal->search->valuesmap.empty() )
+    {
+        /* This is the old legacy version. Guess and try */
+        return internal->get_document().get_value(2).empty() == true ? -1 : atoi(internal->get_document().get_value(2).c_str());
+    }
+    else if ( internal->search->valuesmap.find("size") != internal->search->valuesmap.end() )
+    {
+        return atoi(internal->get_document().get_value(internal->search->valuesmap["size"]).c_str());
+    }
+#endif
+    /* The size is never used. Do we really want to get the content and
+       calculate the size ? */
+    return -1;
+}
+
+int search_iterator::get_wordCount() const      {
+#if defined(ENABLE_XAPIAN)
+    if ( ! internal ) {
+        return -1;
+    }
+    if ( internal->search->valuesmap.empty() )
+    {
+        /* This is the old legacy version. Guess and try */
+        return internal->get_document().get_value(3).empty() == true ? -1 : atoi(internal->get_document().get_value(3).c_str());
+    }
+    else if ( internal->search->valuesmap.find("wordcount") != internal->search->valuesmap.end() )
+    {
+        return atoi(internal->get_document().get_value(internal->search->valuesmap["wordcount"]).c_str());
+    }
+#endif
+    return -1;
+}
+
+/* Index of the zim file the current result belongs to; 0 without internal
+ * data or without xapian. */
+int search_iterator::get_fileIndex() const {
+#if defined(ENABLE_XAPIAN)
+    return internal ? internal->get_databasenumber() : 0;
+#else
+    return 0;
+#endif
+}
+
+/* Article of the current result. `internal` is dereferenced without check:
+ * must not be called on an iterator without internal data. */
+search_iterator::reference search_iterator::operator*() const {
+    return internal->get_article();
+}
+
+/* Pointer to the article of the current result. `internal` is dereferenced
+ * without check: must not be called on an iterator without internal data. */
+search_iterator::pointer search_iterator::operator->() const {
+    return &internal->get_article();
+}
+
+} // namespace zim
diff --git a/src/template.cpp b/src/template.cpp

new file mode 100644 (file)

index 0000000..75e4bb8
--- /dev/null
+++ b/src/template.cpp
@@ -0,0 +1,142 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "template.h"
+
+namespace zim
+{
+  /* Plain data: buffer the character; a '<' may start a tag, so remember
+     its position in `save` and switch to state_lt. */
+  void TemplateParser::state_data(char ch)
+  {
+    data += ch;
+
+    if (ch == '<')
+    {
+      state = &TemplateParser::state_lt;
+      save = data.size() - 1;
+    }
+  }
+
+  /* After '<': a '%' opens a tag, anything else is plain data again. */
+  void TemplateParser::state_lt(char ch)
+  {
+    data += ch;
+    state = (ch == '%') ? &TemplateParser::state_token0
+                        : &TemplateParser::state_data;
+  }
+
+  /* First character after "<%": '/' announces a link, any other character
+     is the first one of a token whose position is kept in `token`. */
+  void TemplateParser::state_token0(char ch)
+  {
+    data += ch;
+
+    if (ch == '/')
+    {
+      state = &TemplateParser::state_link0;
+      return;
+    }
+
+    token = data.size() - 1;
+    state = &TemplateParser::state_token;
+  }
+
+  /* Inside a token: buffer characters until a '%' possibly ends it. */
+  void TemplateParser::state_token(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+      state = &TemplateParser::state_token_end;
+  }
+
+  void TemplateParser::state_token_end(char ch)
+  {
+    if (ch == '>')
+    {
+      if (event)
+      {
+        event->onData(data.substr(0, save));
+        event->onToken(data.substr(token, data.size() - token - 1));
+        data.clear();
+      }
+
+      state = &TemplateParser::state_data;
+    }
+    else
+    {
+      data += ch;
+      state = &TemplateParser::state_data;
+    }
+  }
+
+  /* Character after "<%/": kept in `ns` and later passed to onLink. */
+  void TemplateParser::state_link0(char ch)
+  {
+    data += ch;
+
+    ns = ch;
+    state = &TemplateParser::state_link;
+  }
+
+  /* After the namespace character: a '/' must follow, the title then starts
+     right behind it (position kept in `token`). Anything else means this was
+     not a link and parsing goes on as plain data. */
+  void TemplateParser::state_link(char ch)
+  {
+    data += ch;
+
+    if (ch != '/')
+    {
+      state = &TemplateParser::state_data;
+      return;
+    }
+
+    token = data.size();
+    state = &TemplateParser::state_title;
+  }
+
+  /* Inside a link title: buffer characters until a '%', whose position is
+     kept in `token_e` as the end of the title. */
+  void TemplateParser::state_title(char ch)
+  {
+    data += ch;
+
+    if (ch == '%')
+    {
+      token_e = data.size() - 1;
+      state = &TemplateParser::state_title_end;
+    }
+  }
+
+  /* After the '%' following a link title: on '>' the data preceding the tag
+     and the link (namespace, title) are reported to the listener (if any)
+     and the buffer is reset. Other characters are buffered and the parser
+     stays in this state. */
+  void TemplateParser::state_title_end(char ch)
+  {
+    data += ch;
+
+    if (ch == '>')
+    {
+      if (event)
+      {
+        event->onData(data.substr(0, save));
+        event->onLink(ns, data.substr(token, token_e - token));
+      }
+
+      data.clear();
+      state = &TemplateParser::state_data;
+    }
+  }
+
+  /* Report everything still buffered as plain data to the listener (if
+     any), then reset the buffer and the parser state. */
+  void TemplateParser::flush()
+  {
+    if (event)
+      event->onData(data);
+    data.clear();
+    state = &TemplateParser::state_data;
+  }
+}
diff --git a/src/template.h b/src/template.h

new file mode 100644 (file)

index 0000000..116be10
--- /dev/null
+++ b/src/template.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_TEMPLATE_H
+#define ZIM_TEMPLATE_H
+
+#include <string>
+
+namespace zim
+{
+  class TemplateParser
+  {
+    public:
+      class Event
+      {
+        public:
+          virtual void onData(const std::string& data) = 0;
+          virtual void onToken(const std::string& token) = 0;
+          virtual void onLink(char ns, const std::string& url) = 0;
+          virtual ~Event() = default;
+      };
+
+    private:
+      Event* event;
+
+      std::string data;
+      std::string::size_type save;
+      std::string::size_type token;
+      std::string::size_type token_e;
+      char ns;
+      typedef void (TemplateParser::*state_type)(char);
+
+      state_type state;
+
+      void state_data(char ch);
+      void state_lt(char ch);
+      void state_token0(char ch);
+      void state_token(char ch);
+      void state_token_end(char ch);
+      void state_link0(char ch);
+      void state_link(char ch);
+      void state_title(char ch);
+      void state_title_end(char ch);
+
+    public:
+      explicit TemplateParser(Event* ev)
+        : event(ev),
+          state(&TemplateParser::state_data)
+        { }
+
+      void parse(char ch)
+      {
+        (this->*state)(ch);
+      }
+
+      void parse(const std::string& s)
+      {
+        for (std::string::const_iterator ch = s.begin(); ch != s.end(); ++ch)
+          parse(*ch);
+      }
+
+      void flush();
+  };
+}
+
+#endif // ZIM_TEMPLATE_H
diff --git a/src/tools.cpp b/src/tools.cpp

new file mode 100644 (file)

index 0000000..6539d7d
--- /dev/null
+++ b/src/tools.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright 2013-2016 Emmanuel Engelhart <kelson@kiwix.org>
+ * Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "tools.h"
+
+#include <sys/types.h>
+#include <dirent.h>
+#include <string.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <memory>
+#include <errno.h>
+
+#include <unicode/translit.h>
+#include <unicode/ucnv.h>
+
+#ifdef _WIN32
+# include <windows.h>
+# include <direct.h>
+# include <io.h>
+# include <stringapiset.h>
+# define SEPARATOR "\\"
+#else
+# include <unistd.h>
+# define SEPARATOR "/"
+#endif
+
+#ifdef __MINGW32__
+# include <time.h>
+#else
+# include <thread>
+# include <chrono>
+#endif
+
+
+/* Return `text` (UTF-8) lower-cased and stripped of its combining marks,
+ * using the ICU transform "Lower; NFD; [:M:] remove; NFC".
+ *
+ * The transliterator is created once and reused. If ICU could not create
+ * it, `text` is returned unchanged instead of dereferencing a null pointer. */
+std::string zim::removeAccents(const std::string& text)
+{
+  ucnv_setDefaultName("UTF-8");
+  static UErrorCode status = U_ZERO_ERROR;
+  static std::unique_ptr<icu::Transliterator> removeAccentsTrans(icu::Transliterator::createInstance(
+      "Lower; NFD; [:M:] remove; NFC", UTRANS_FORWARD, status));
+  if (!removeAccentsTrans) {
+    // createInstance failed: no transformation is possible.
+    return text;
+  }
+  icu::UnicodeString ustring(text.c_str());
+  removeAccentsTrans->transliterate(ustring);
+  std::string unaccentedText;
+  ustring.toUTF8String(unaccentedText);
+  return unaccentedText;
+}
+
+
+/* Suspend the calling thread for at least `microseconds` microseconds. */
+void zim::microsleep(int microseconds) {
+#ifdef __MINGW32__
+   struct timespec wait = {0, 0};
+   // Whole seconds go to tv_sec, the remaining microseconds are converted
+   // to nanoseconds so that tv_nsec stays below one second.
+   wait.tv_sec = microseconds / 1000000;
+   wait.tv_nsec = (microseconds % 1000000) * 1000;
+   nanosleep(&wait, nullptr);
+#else
+   std::this_thread::sleep_for(std::chrono::microseconds(microseconds));
+#endif
+}
diff --git a/src/tools.h b/src/tools.h

new file mode 100644 (file)

index 0000000..1a58e7b
--- /dev/null
+++ b/src/tools.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright 2013-2016 Emmanuel Engelhart <kelson@kiwix.org>
+ * Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_TOOLS_H
+#define OPENZIM_LIBZIM_TOOLS_H
+
+#include <string>
+
+namespace zim {
+
+  // Returns a copy of `text` run through the accent-removing ICU
+  // transliteration defined in tools.cpp.
+  std::string removeAccents(const std::string& text);
+  // Blocks the calling thread for `microseconds` microseconds.
+  void microsleep(int microseconds);
+}
+
+#endif  // OPENZIM_LIBZIM_TOOLS_H
diff --git a/src/uuid.cpp b/src/uuid.cpp

new file mode 100644 (file)

index 0000000..80da56b
--- /dev/null
+++ b/src/uuid.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/uuid.h>
+#include <iostream>
+#include <time.h>
+#include <zim/zim.h> // necessary to have the new types
+#include "log.h"
+#include "md5.h"
+
+#ifdef _WIN32
+
+#  include <time.h>
+#  include <windows.h>
+// Minimal stand-in for POSIX gettimeofday() on Windows.
+//
+// The value comes from timeGetTime(), which counts milliseconds; it is split
+// into whole seconds and a microsecond remainder. `tzp` is ignored.
+// Always returns 0.
+int gettimeofday(struct timeval* tp, void* tzp) {
+    (void)tzp;  // time zone information is not provided
+    const DWORD elapsedMs = timeGetTime();
+    tp->tv_sec = elapsedMs / 1000;
+    // tv_usec is expressed in microseconds: scale the millisecond remainder.
+    tp->tv_usec = (elapsedMs % 1000) * 1000;
+    return 0;
+}
+
+#define getpid GetCurrentProcessId
+
+#else
+#  include <sys/time.h>
+#endif
+
+log_define("zim.uuid")
+
+namespace zim
+{
+  namespace
+  {
+    // Lookup table: 4-bit value -> lowercase hexadecimal digit.
+    char hex[] = "0123456789abcdef";
+
+    // Hexadecimal digit for the upper four bits of `v`.
+    inline char hi(char v)
+    {
+      const int nibble = (v >> 4) & 0xf;
+      return hex[nibble];
+    }
+
+    // Hexadecimal digit for the lower four bits of `v`.
+    inline char lo(char v)
+    {
+      const int nibble = v & 0xf;
+      return hex[nibble];
+    }
+  }
+
+  Uuid Uuid::generate(std::string value)
+  {
+    Uuid ret;
+    struct zim_MD5_CTX md5ctx;
+    zim_MD5Init(&md5ctx);
+
+    if ( value.empty() ) {
+      struct timeval tv;
+      gettimeofday(&tv, 0);
+
+      clock_t c = clock();
+
+      zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&c), sizeof(clock_t));
+      zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(&tv), sizeof(struct timeval));
+    } else {
+      zim_MD5Update(&md5ctx, reinterpret_cast<const uint8_t*>(value.data()), value.size());
+    }
+    zim_MD5Final(reinterpret_cast<uint8_t*>(&ret.data[0]), &md5ctx);
+
+    log_debug("generated uuid: " << ret.data);
+
+    return ret;
+  }
+
+  std::ostream& operator<< (std::ostream& out, const Uuid& uuid)
+  {
+    for (unsigned n = 0; n < 4; ++n)
+      out << hi(uuid.data[n]) << lo(uuid.data[n]);
+    out << '-';
+    for (unsigned n = 4; n < 6; ++n)
+      out << hi(uuid.data[n]) << lo(uuid.data[n]);
+    out << '-';
+    for (unsigned n = 6; n < 8; ++n)
+      out << hi(uuid.data[n]) << lo(uuid.data[n]);
+    out << '-';
+    for (unsigned n = 8; n < 10; ++n)
+      out << hi(uuid.data[n]) << lo(uuid.data[n]);
+    out << '-';
+    for (unsigned n = 10; n < 16; ++n)
+      out << hi(uuid.data[n]) << lo(uuid.data[n]);
+    return out;
+  }
+
+}
diff --git a/src/writer/_dirent.h b/src/writer/_dirent.h

new file mode 100644 (file)

index 0000000..e7338ee
--- /dev/null
+++ b/src/writer/_dirent.h
@@ -0,0 +1,179 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_DIRENT_H
+#define ZIM_WRITER_DIRENT_H
+
+#include "cluster.h"
+
+#include "debug.h"
+
+namespace zim
+{
+  namespace writer {
+    class Dirent;
+    // Payload of a non-redirect entry: the cluster number and blob number
+    // recorded for it. Both start at zero.
+    struct DirectInfo {
+      DirectInfo() :
+        clusterNumber(0),
+        blobNumber(0)
+      {};
+      cluster_index_t  clusterNumber;
+      blob_index_t     blobNumber;
+    };
+
+    // Payload of a redirect entry: the Dirent it points to (null until set).
+    struct RedirectInfo {
+      const Dirent* redirectDirent = nullptr;
+    };
+
+    // Storage shared by the two payloads above. Dirent writes `r` in
+    // setRedirect() and `d` in its constructor, setCluster() and setArticle().
+    union DirentInfo {
+      DirectInfo d;
+      RedirectInfo r;
+    };
+
+    class Dirent
+    {
+        static const uint16_t redirectMimeType = 0xffff;
+        static const uint16_t linktargetMimeType = 0xfffe;
+        static const uint16_t deletedMimeType = 0xfffd;
+        static const uint32_t version = 0;
+
+        uint16_t mimeType;
+        DirentInfo info {};
+        Url url;
+        std::string title;
+        Cluster* cluster = nullptr;
+        Url redirectUrl;
+        article_index_t idx = article_index_t(0);
+        offset_t offset;
+
+      public:
+        Dirent()
+          : mimeType(0),
+            url(),
+            title(),
+            redirectUrl()
+        {
+          info.d.clusterNumber = cluster_index_t(0);
+          info.d.blobNumber = blob_index_t(0);
+        }
+
+        explicit Dirent(Url url_ )
+          : Dirent()
+          { url = url_; }
+
+        char getNamespace() const               { return url.getNs(); }
+        const std::string& getTitle() const     { return title.empty() ? url.getUrl() : title; }
+        void setTitle(const std::string& title_) { title = title_; }
+        const std::string& getUrl() const       { return url.getUrl(); }
+        const Url& getFullUrl() const { return url; }
+        void setUrl(Url url_) {
+          url = url_;
+        }
+
+        uint32_t getVersion() const            { return version; }
+
+        void setRedirectUrl(Url redirectUrl_)     { redirectUrl = redirectUrl_; }
+        const Url& getRedirectUrl() const         { return redirectUrl; }
+        void setRedirect(const Dirent* target) {
+          info.r.redirectDirent = target;
+          mimeType = redirectMimeType;
+        }
+        article_index_t getRedirectIndex() const      { return isRedirect() ? info.r.redirectDirent->getIdx() : article_index_t(0); }
+
+        void setMimeType(uint16_t mime)
+        {
+          mimeType = mime;
+        }
+
+        void setLinktarget()
+        {
+          ASSERT(mimeType, ==, 0);
+          mimeType = linktargetMimeType;
+        }
+
+        void setDeleted()
+        {
+          ASSERT(mimeType, ==, 0);
+          mimeType = deletedMimeType;
+        }
+
+
+        void setIdx(article_index_t idx_)      { idx = idx_; }
+        article_index_t getIdx() const         { return idx; }
+
+
+        void setCluster(zim::writer::Cluster* _cluster)
+        {
+          ASSERT(isArticle(), ==, true);
+          cluster = _cluster;
+          info.d.blobNumber = _cluster->count();
+        }
+
+        cluster_index_t getClusterNumber() const {
+          return cluster ? cluster->getClusterIndex() : info.d.clusterNumber;
+        }
+        blob_index_t  getBlobNumber() const {
+          return isRedirect() ? blob_index_t(0) : info.d.blobNumber;
+        }
+
+        bool isRedirect() const                 { return mimeType == redirectMimeType; }
+        bool isLinktarget() const               { return mimeType == linktargetMimeType; }
+        bool isDeleted() const                  { return mimeType == deletedMimeType; }
+        bool isArticle() const                  { return !isRedirect() && !isLinktarget() && !isDeleted(); }
+        uint16_t getMimeType() const            { return mimeType; }
+        size_t getDirentSize() const
+        {
+          size_t ret = (isRedirect() ? 12 : 16) + url.getUrl().size() + 2;
+          if (title != url.getUrl())
+            ret += title.size();
+          return ret;
+        }
+
+        offset_t getOffset() const { return offset; }
+        void setOffset(offset_t o) { offset = o; }
+
+        void setArticle(uint16_t mimeType_, cluster_index_t clusterNumber_, blob_index_t blobNumber_)
+        {
+          ASSERT(mimeType, ==, 0);
+          mimeType = mimeType_;
+          info.d.clusterNumber = clusterNumber_;
+          info.d.blobNumber = blobNumber_;
+        }
+
+        void write(int out_fd) const;
+
+        friend bool compareUrl(const Dirent* d1, const Dirent* d2);
+        friend inline bool compareTitle(const Dirent* d1, const Dirent* d2);
+    };
+
+
+    // Strict ordering of two entries by their full url.
+    inline bool compareUrl(const Dirent* d1, const Dirent* d2)
+    {
+      const Url& left = d1->url;
+      const Url& right = d2->url;
+      return left < right;
+    }
+    // Strict ordering of two entries by namespace first, then by title
+    // (getTitle() falls back to the url when no title is set).
+    inline bool compareTitle(const Dirent* d1, const Dirent* d2)
+    {
+      const auto leftNs = d1->url.getNs();
+      const auto rightNs = d2->url.getNs();
+      if (leftNs != rightNs)
+        return leftNs < rightNs;
+      return d1->getTitle() < d2->getTitle();
+    }
+  }
+}
+
+#endif // ZIM_WRITER_DIRENT_H
+
diff --git a/src/writer/article.cpp b/src/writer/article.cpp

new file mode 100644 (file)

index 0000000..bb62f34
--- /dev/null
+++ b/src/writer/article.cpp
@@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/blob.h>
+#include <zim/writer/article.h>
+
+namespace zim
+{
+  namespace writer
+  {
+    // Base implementation: always reports false.
+    bool Article::isLinktarget() const
+    {
+      return false;
+    }
+
+    // Base implementation: always reports false.
+    bool Article::isDeleted() const
+    {
+      return false;
+    }
+
+    // Base implementation: always returns an empty string.
+    std::string Article::getNextCategory()
+    {
+      return {};
+    }
+
+  }
+}
diff --git a/src/writer/cluster.cpp b/src/writer/cluster.cpp

new file mode 100644 (file)

index 0000000..1e3c66a
--- /dev/null
+++ b/src/writer/cluster.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "cluster.h"
+#include "../log.h"
+#include "../endian_tools.h"
+#include "../debug.h"
+#include "../compression.h"
+
+#include <sstream>
+#include <fstream>
+
+#include <fcntl.h>
+#include <stdexcept>
+
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) ::write((fd), (addr), (size))
+#endif
+
+namespace zim {
+namespace writer {
+
+// Creates an empty, non-extended cluster that will use `compression`.
+Cluster::Cluster(CompressionType compression)
+  : compression(compression),
+    isExtended(false),
+    _size(0)
+{
+  // The offset list always starts with 0; every added blob appends the
+  // running total, so blob i spans [offsets[i], offsets[i+1]).
+  blobOffsets.push_back(offset_t(0));
+  pthread_mutex_init(&m_closedMutex, nullptr);
+}
+
+// Destroys the mutex and frees the compressed buffer, if one is still held.
+Cluster::~Cluster() {
+  pthread_mutex_destroy(&m_closedMutex);
+  // delete[] on a null pointer is a no-op, so no explicit check is needed.
+  delete[] compressed_data.data();
+}
+
+// Drops the offset list and the pending data. Swapping with an empty
+// container hands the old storage to a temporary that is destroyed at once.
+void Cluster::clear() {
+  {
+    Offsets discardedOffsets;
+    discardedOffsets.swap(blobOffsets);
+  }
+  {
+    ClusterData discardedData;
+    discardedData.swap(_data);
+  }
+}
+
+// Finalises the cluster: compresses the content when a real compression
+// method is selected, then raises the `closed` flag under the mutex.
+void Cluster::close() {
+  const auto method = getCompression();
+  const bool storedUncompressed =
+      (method == zim::zimcompDefault || method == zim::zimcompNone);
+
+  if (!storedUncompressed) {
+    // We must compress the content in a buffer.
+    compress();
+  }
+
+  pthread_mutex_lock(&m_closedMutex);
+  closed = true;
+  pthread_mutex_unlock(&m_closedMutex);
+}
+
+// Reads the `closed` flag while holding the mutex.
+bool Cluster::isClosed() const{
+  pthread_mutex_lock(&m_closedMutex);
+  const bool snapshot = closed;
+  pthread_mutex_unlock(&m_closedMutex);
+  return snapshot;
+}
+
+// Size of the offset table plus the blob data added so far. Offsets take
+// 8 bytes each in an extended cluster and 4 bytes otherwise.
+// Throws std::runtime_error when called on a closed cluster.
+zsize_t Cluster::size() const
+{
+  if (isClosed()) {
+    throw std::runtime_error("oups");
+  }
+  const size_t offsetWidth = isExtended ? sizeof(uint64_t) : sizeof(uint32_t);
+  return zsize_t(blobOffsets.size() * offsetWidth) + _size;
+}
+
+// Emits the offset table through `writer`, one little-endian OFFSET_TYPE per
+// entry. Every stored offset is shifted by the size of the table itself.
+template<typename OFFSET_TYPE>
+void Cluster::write_offsets(writer_t writer) const
+{
+  const size_type tableSize = blobOffsets.size() * sizeof(OFFSET_TYPE);
+  char encoded[sizeof(OFFSET_TYPE)];
+  for (const auto& stored : blobOffsets)
+  {
+    auto shifted = stored.v;
+    shifted += tableSize;
+    toLittleEndian(static_cast<OFFSET_TYPE>(shifted), encoded);
+    writer(Blob(encoded, sizeof(OFFSET_TYPE)));
+  }
+}
+
+// Emits the cluster body through `writer`: the offset table (64-bit entries
+// for an extended cluster, 32-bit otherwise) followed by the blob data.
+void Cluster::write_content(writer_t writer) const
+{
+  if (!isExtended) {
+    write_offsets<uint32_t>(writer);
+  } else {
+    write_offsets<uint64_t>(writer);
+  }
+
+  write_data(writer);
+}
+
+// Compresses the cluster content with the method returned by
+// getCompression(), dispatching to the matching _compress<> instantiation.
+//
+// Throws std::runtime_error for bzip2, for zlib/zstd when the library was
+// built without ENABLE_ZLIB / ENABLE_ZSTD, and for any other value
+// (including the uncompressed ones).
+void Cluster::compress()
+{
+  auto comp = getCompression();
+  switch(comp) {
+    // Methods with no compressor in this build share one error branch; the
+    // zip and zstd labels land here only when their support is compiled out.
+    case zim::zimcompBzip2:
+#if !defined(ENABLE_ZLIB)
+    case zim::zimcompZip:
+#endif
+#if !defined(ENABLE_ZSTD)
+    case zim::zimcompZstd:
+#endif
+      {
+        throw std::runtime_error("Compression method not enabled in this library");
+        break;
+      }
+
+    case zim::zimcompLzma:
+      {
+        _compress<LZMA_INFO>();
+        break;
+      }
+
+#if defined(ENABLE_ZLIB)
+    case zim::zimcompZip:
+      {
+        _compress<ZIP_INFO>();
+        break;
+      }
+#endif
+
+#if defined(ENABLE_ZSTD)
+    case zim::zimcompZstd:
+      {
+        _compress<ZSTD_INFO>();
+        break;
+      }
+#endif
+
+    // Anything else is not a compression method this function handles.
+    default:
+      throw std::runtime_error("We cannot compress an uncompressed cluster");
+  };
+}
+
+// Streams the cluster content (offsets, then data) into a
+// Compressor<COMP_TYPE> and keeps the result in `compressed_data`.
+template<typename COMP_TYPE>
+void Cluster::_compress()
+{
+  Compressor<COMP_TYPE> runner;
+  bool initialised = false;
+
+  // The compressor is initialised with the first chunk it receives.
+  auto feedCompressor = [&](const Blob& chunk) -> void {
+    if (!initialised) {
+      runner.init(const_cast<char*>(chunk.data()));
+      initialised = true;
+    }
+    runner.feed(chunk.data(), chunk.size());
+  };
+  write_content(feedCompressor);
+
+  // Ownership of the buffer moves to compressed_data; it is released with
+  // delete[] in write() or in the destructor.
+  zsize_t compressedSize;
+  auto buffer = runner.get_data(&compressedSize);
+  compressed_data = Blob(buffer.release(), compressedSize.v);
+}
+
+void Cluster::write(int out_fd) const
+{
+  // write clusterInfo
+  char clusterInfo = 0;
+  if (isExtended) {
+    clusterInfo = 0x10;
+  }
+  clusterInfo += getCompression();
+  if (_write(out_fd, &clusterInfo, 1) == -1) {
+    throw std::runtime_error("Error writng");
+  }
+
+  // Open a comprestion stream if needed
+  switch(getCompression())
+  {
+    case zim::zimcompDefault:
+    case zim::zimcompNone:
+    {
+      auto writer = [=](const Blob& data) -> void {
+        // Ideally we would simply have to do :
+        // ::write(tmp_fd, data.c_str(), data.size());
+        // However, the data can be pretty big (> 4Gb), especially with test,
+        // And ::write fails to write data > 4Gb. So we have to chunck the write.
+        size_type to_write = data.size();
+        const char* src = data.data();
+        while (to_write) {
+         size_type chunk_size = to_write > 4096 ? 4096 : to_write;
+         auto ret = _write(out_fd, src, chunk_size);
+         src += ret;
+         to_write -= ret;
+        }
+      };
+      write_content(writer);
+      break;
+    }
+
+    case zim::zimcompZip:
+    case zim::zimcompBzip2:
+    case zim::zimcompLzma:
+    case zim::zimcompZstd:
+      {
+        log_debug("compress data");
+        if (_write(out_fd, compressed_data.data(), compressed_data.size()) == -1) {
+          throw std::runtime_error("Error writing");
+        }
+        delete [] compressed_data.data();
+        compressed_data = Blob();
+        break;
+      }
+
+    default:
+      std::ostringstream msg;
+      msg << "invalid compression flag " << getCompression();
+      log_error(msg.str());
+      throw std::runtime_error(msg.str());
+  }
+}
+
+// Appends the content of `article` as a new blob. The content is taken from
+// the article's file when getFilename() is non-empty, otherwise from
+// getData(). An empty article still gets an offset entry but no data.
+void Cluster::addArticle(const zim::writer::Article* article)
+{
+  auto filename = article->getFilename();
+  auto size = article->getSize();
+  _size += size;
+  blobOffsets.push_back(offset_t(_size.v));
+  // The offsets that get written are cumulative and shifted by the size of
+  // the offset table, so it is the running total (not this blob alone) that
+  // must fit in 32 bits for the cluster to stay non-extended.
+  isExtended |= (_size.v + blobOffsets.size() * sizeof(uint32_t) > UINT32_MAX);
+  if (size == 0)
+    return;
+
+  if (filename.empty()) {
+    _data.emplace_back(DataType::plain, article->getData());
+  }
+  else {
+    _data.emplace_back(DataType::file, filename);
+  }
+}
+
+// Appends a copy of the `size` bytes at `data` as a new blob. A zero-sized
+// blob still gets an offset entry but no data.
+void Cluster::addData(const char* data, zsize_t size)
+{
+  _size += size;
+  blobOffsets.push_back(offset_t(_size.v));
+  // The offsets that get written are cumulative and shifted by the size of
+  // the offset table, so it is the running total (not this blob alone) that
+  // must fit in 32 bits for the cluster to stay non-extended.
+  isExtended |= (_size.v + blobOffsets.size() * sizeof(uint32_t) > UINT32_MAX);
+  if (size.v == 0)
+    return;
+
+  _data.emplace_back(DataType::plain, data, size.v);
+}
+
+void Cluster::write_data(writer_t writer) const
+{
+  for (auto& data: _data)
+  {
+    ASSERT(data.value.empty(), ==, false);
+    if (data.type == DataType::plain) {
+      writer(Blob(data.value.c_str(), data.value.size()));
+    } else {
+      int fd = open(data.value.c_str(), O_RDONLY);
+      if (fd == -1) {
+        throw std::runtime_error(std::string("cannot open ") + data.value);
+      }
+      char* buffer = new char[1024*1024];
+      while (true) {
+        auto r = read(fd, buffer, 1024*1024);
+        if (!r)
+          break;
+        writer(Blob(buffer, r));
+      }
+      delete [] buffer;
+      ::close(fd);
+    }
+  }
+}
+
+} // writer
+} // zim
diff --git a/src/writer/cluster.h b/src/writer/cluster.h

new file mode 100644 (file)

index 0000000..ca07be4
--- /dev/null
+++ b/src/writer/cluster.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (C) 2017 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CLUSTER_H_
+#define ZIM_WRITER_CLUSTER_H_
+
+#include <zim/zim.h>
+#include <zim/blob.h>
+#include <iostream>
+#include <vector>
+#include <pthread.h>
+#include <functional>
+
+#include <zim/writer/article.h>
+#include "../zim_types.h"
+
+namespace zim {
+
+namespace writer {
+
+// How Data::value is to be interpreted: the content itself (plain) or the
+// path of a file holding the content (file).
+enum class DataType { plain, file };
+
+// One pending blob of a cluster.
+struct Data {
+  // Stores `value` as given.
+  Data(zim::writer::DataType type, const std::string& value)
+    : type(type),
+      value(value)
+  {}
+
+  // Stores a copy of the `size` bytes starting at `data`.
+  Data(zim::writer::DataType type, const char* data, zim::size_type size)
+    : type(type),
+      value(data, size)
+  {}
+
+  DataType type;
+  std::string value;
+};
+
+// Callback receiving successive pieces of cluster content.
+using writer_t = std::function<void(const Blob& data)>;
+
+// Writer-side cluster: collects blobs (in memory or as file references),
+// optionally compresses them, and writes the result to a file descriptor.
+class Cluster {
+  typedef std::vector<offset_t> Offsets;
+  typedef std::vector<Data> ClusterData;
+
+
+  public:
+    Cluster(CompressionType compression);
+    virtual ~Cluster();
+
+    void setCompression(CompressionType c) { compression = c; }
+    CompressionType getCompression() const { return compression; }
+
+    // Append one blob, taken from an article or from a raw buffer.
+    void addArticle(const zim::writer::Article* article);
+    void addData(const char* data, zsize_t size);
+
+    // Number of blobs: the offset list always holds one more entry than
+    // there are blobs.
+    blob_index_t count() const  { return blob_index_t(blobOffsets.size() - 1); }
+    // Offset table size plus data size; throws once the cluster is closed.
+    zsize_t size() const;
+    offset_t getOffset() const { return offset; }
+    void setOffset(offset_t o) { offset = o; }
+    bool is_extended() const { return isExtended; }
+    // Drops the offset list and the pending data.
+    void clear();
+    // Compresses the content if a compression method is set, then marks the
+    // cluster as closed.
+    void close();
+    bool isClosed() const;
+
+    void setClusterIndex(cluster_index_t idx) { index = idx; }
+    cluster_index_t getClusterIndex() const { return index; }
+
+    // Size of blob `n`: the difference between two consecutive offsets.
+    zsize_t getBlobSize(blob_index_t n) const
+    { return zsize_t(blobOffsets[blob_index_type(n)+1].v - blobOffsets[blob_index_type(n)].v); }
+
+    // Writes the info byte and the content to `out_fd`. For compressed
+    // clusters the compressed buffer is freed afterwards, which is why
+    // `compressed_data` is mutable.
+    void write(int out_fd) const;
+
+  protected:
+    CompressionType compression;
+    cluster_index_t index;
+    // True when offsets are written as 64-bit values instead of 32-bit.
+    bool isExtended;
+    // Cumulative end offsets of the blobs, starting with a leading 0.
+    Offsets blobOffsets;
+    offset_t offset;
+    // Total size of the blob data added so far (offset table excluded).
+    zsize_t _size;
+    ClusterData _data;
+    // Buffer produced by compress(); released with delete[].
+    mutable Blob compressed_data;
+    // NOTE(review): not referenced by the code in cluster.cpp; confirm
+    // whether it is still needed.
+    std::string tmp_filename;
+    // Guards `closed`.
+    mutable pthread_mutex_t m_closedMutex;
+    bool closed = false;
+
+  private:
+    // Offsets followed by data, both pushed through `writer`.
+    void write_content(writer_t writer) const;
+    template<typename OFFSET_TYPE>
+    void write_offsets(writer_t writer) const;
+    void write_data(writer_t writer) const;
+    // Dispatches to _compress<> according to the compression method.
+    void compress();
+    template<typename COMP_INFO>
+    void _compress();
+};
+
+};
+
+};
+
+
+#endif //ZIM_WRITER_CLUSTER_H_
diff --git a/src/writer/creator.cpp b/src/writer/creator.cpp

new file mode 100644 (file)

index 0000000..7358968
--- /dev/null
+++ b/src/writer/creator.cpp
@@ -0,0 +1,638 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "config.h"
+
+#include "creatordata.h"
+#include "cluster.h"
+#include "debug.h"
+#include "workers.h"
+#include <zim/blob.h>
+#include <zim/writer/creator.h>
+#include "../endian_tools.h"
+#include <algorithm>
+#include <fstream>
+#include "../md5.h"
+
+#if defined(ENABLE_XAPIAN)
+  #include "xapianIndexer.h"
+#endif
+
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+#include <sys/stat.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <limits>
+#include <stdexcept>
+#include <sstream>
+#include <ctime>
+#include "log.h"
+#include "../fs.h"
+#include "../tools.h"
+
+log_define("zim.writer.creator")
+
+#define INFO(e) \
+    do { \
+        log_info(e); \
+        std::cout << e << std::endl; \
+    } while(false)
+
+#define TINFO(e) \
+    if (verbose) { \
+        double seconds = difftime(time(NULL), data->start_time); \
+        std::cout << "T:" << (int)(seconds) \
+                  << "; " << e << std::endl; \
+    }
+
+#define CLUSTER_BASE_OFFSET 1024
+
+namespace zim
+{
+  namespace writer
+  {
+    // Builds an idle creator. `verbose` is only stored here; it later gates
+    // the progress output printed on std::cout.
+    Creator::Creator(bool verbose)
+      : verbose(verbose)
+    {}
+
+    // Defaulted out of line.
+    // NOTE(review): presumably defined in this .cpp so that `data` (a
+    // std::unique_ptr<CreatorData>) is destroyed where CreatorData is a
+    // complete type — confirm against the public header.
+    Creator::~Creator() = default;
+
+    void Creator::startZimCreation(const std::string& fname)
+    {
+      data = std::unique_ptr<CreatorData>(new CreatorData(fname, verbose, withIndex, indexingLanguage));
+      data->setMinChunkSize(minChunkSize);
+
+      for(unsigned i=0; i<nbWorkerThreads; i++)
+      {
+        pthread_t thread;
+        pthread_create(&thread, NULL, taskRunner, this->data.get());
+        data->workerThreads.push_back(thread);
+      }
+
+      pthread_create(&data->writerThread, NULL, clusterWriter, this->data.get());
+    }
+
+    void Creator::addArticle(std::shared_ptr<Article> article)
+    {
+      auto dirent = data->createDirentFromArticle(article.get());
+      data->addDirent(dirent, article.get());
+      data->nbArticles++;
+      if (article->isRedirect()) {
+        data->nbRedirectArticles++;
+      } else {
+        if (article->shouldCompress())
+          data->nbCompArticles++;
+        else
+          data->nbUnCompArticles++;
+        if (!article->getFilename().empty())
+          data->nbFileArticles++;
+        if (article->shouldIndex())
+          data->nbIndexArticles++;
+      }
+      if (verbose && data->nbArticles%1000 == 0){
+        double seconds = difftime(time(NULL),data->start_time);
+        std::cout << "T:" << (int)seconds
+                  << "; A:" << data->nbArticles
+                  << "; RA:" << data->nbRedirectArticles
+                  << "; CA:" << data->nbCompArticles
+                  << "; UA:" << data->nbUnCompArticles
+                  << "; FA:" << data->nbFileArticles
+                  << "; IA:" << data->nbIndexArticles
+                  << "; C:" << data->nbClusters
+                  << "; CC:" << data->nbCompClusters
+                  << "; UC:" << data->nbUnCompClusters
+                  << "; WC:" << data->taskList.size()
+                  << std::endl;
+      }
+
+#if defined(ENABLE_XAPIAN)
+      if (article->shouldIndex()) {
+        data->titleIndexer.index(article.get());
+        if(withIndex && !article->isRedirect()) {
+          data->taskList.pushToQueue(new IndexTask(article));
+        }
+      }
+#endif
+    }
+
+    // Finalizes the zim file: waits for pending indexing, adds the index
+    // articles (xapian builds only), closes the clusters still open, stops
+    // the worker and writer threads, resolves redirects / indexes / mimetypes,
+    // writes the remaining parts of the file and renames the temporary file
+    // to its final name.
+    void Creator::finishZimCreation()
+    {
+      // Last statistics line (same layout as the one printed by addArticle()).
+      if (verbose) {
+        double seconds = difftime(time(NULL),data->start_time);
+        std::cout << "T:" << (int)seconds
+                  << "; A:" << data->nbArticles
+                  << "; RA:" << data->nbRedirectArticles
+                  << "; CA:" << data->nbCompArticles
+                  << "; UA:" << data->nbUnCompArticles
+                  << "; FA:" << data->nbFileArticles
+                  << "; IA:" << data->nbIndexArticles
+                  << "; C:" << data->nbClusters
+                  << "; CC:" << data->nbCompClusters
+                  << "; UC:" << data->nbUnCompClusters
+                  << "; WC:" << data->taskList.size()
+                  << std::endl;
+      }
+
+      // We need to wait until all indexing tasks are done before closing the
+      // xapian database and adding it to the zim file.
+      // Polling loop: the sleep grows by 10 at each round (starting at 0).
+      unsigned int wait = 0;
+      do {
+        microsleep(wait);
+        wait += 10;
+      } while(IndexTask::waiting_task.load() > 0);
+
+#if defined(ENABLE_XAPIAN)
+      {
+        // Close the title index and add it to the zim file as an article.
+        data->titleIndexer.indexingPostlude();
+        auto article = data->titleIndexer.getMetaArticle();
+        auto dirent = data->createDirentFromArticle(article);
+        data->addDirent(dirent, article);
+        delete article;
+      }
+      if (withIndex) {
+        // Same wait as above, done again before closing the full-text index.
+        wait = 0;
+        do {
+          microsleep(wait);
+          wait += 10;
+        } while(IndexTask::waiting_task.load() > 0);
+
+        data->indexer->indexingPostlude();
+        // NOTE(review): the reason for this fixed extra sleep is not visible
+        // here — confirm whether it is still needed.
+        microsleep(100);
+        auto article = data->indexer->getMetaArticle();
+        auto dirent = data->createDirentFromArticle(article);
+        data->addDirent(dirent, article);
+        delete article;
+      }
+#endif
+
+      // When we've seen all articles, write any remaining clusters.
+      if (data->compCluster->count())
+        data->closeCluster(true);
+
+      if (data->uncompCluster->count())
+        data->closeCluster(false);
+
+      TINFO("Waiting for workers");
+      // Wait until all cluster tasks have been processed.
+      wait = 0;
+      do {
+        microsleep(wait);
+        wait += 10;
+      } while(ClusterTask::waiting_task.load() > 0);
+
+      // Quit all workerThreads: one null task per worker is the quit signal.
+      for (auto i=0U; i< nbWorkerThreads; i++) {
+        data->taskList.pushToQueue(nullptr);
+      }
+      for(auto& thread: data->workerThreads) {
+        pthread_join(thread, nullptr);
+      }
+
+      // Wait for writerThread to finish (a null cluster is its quit signal).
+      data->clusterToWrite.pushToQueue(nullptr);
+      pthread_join(data->writerThread, nullptr);
+
+      TINFO("ResolveRedirectIndexes");
+      data->resolveRedirectIndexes();
+
+      TINFO("Set article indexes");
+      data->setArticleIndexes();
+
+      TINFO("Resolve mimetype");
+      data->resolveMimeTypes();
+
+      TINFO("create title index");
+      data->createTitleIndex();
+      TINFO(data->dirents.size() << " title index created");
+      TINFO(data->clustersList.size() << " clusters created");
+
+      TINFO("write zimfile :");
+      write();
+      ::close(data->out_fd);
+
+      // The file was built as "<basename>.zim.tmp"; publish it under its
+      // final name only once it is complete.
+      TINFO("rename tmpfile to final one.");
+      DEFAULTFS::rename(data->basename+".zim.tmp", data->basename+".zim");
+
+      TINFO("finish");
+    }
+
+    void Creator::fillHeader(Fileheader* header) const
+    {
+      auto mainUrl = getMainUrl();
+      auto layoutUrl = getLayoutUrl();
+
+      if (data->isExtended) {
+        header->setMajorVersion(Fileheader::zimExtendedMajorVersion);
+      } else {
+        header->setMajorVersion(Fileheader::zimClassicMajorVersion);
+      }
+      header->setMinorVersion(Fileheader::zimMinorVersion);
+      header->setMainPage(std::numeric_limits<article_index_type>::max());
+      header->setLayoutPage(std::numeric_limits<article_index_type>::max());
+
+      if (!mainUrl.empty() || !layoutUrl.empty())
+      {
+        for (auto& dirent: data->dirents)
+        {
+          if (mainUrl == dirent->getFullUrl())
+          {
+            header->setMainPage(article_index_type(dirent->getIdx()));
+          }
+
+          if (layoutUrl == dirent->getFullUrl())
+          {
+            header->setLayoutPage(article_index_type(dirent->getIdx()));
+          }
+        }
+      }
+
+      header->setUuid( getUuid() );
+      header->setArticleCount( data->dirents.size() );
+
+      header->setMimeListPos( Fileheader::size );
+
+      header->setClusterCount( data->clustersList.size() );
+    }
+
+    // Writes everything except the clusters to data->out_fd, in this order:
+    // mimetype list (right after the header area), then, appended at the end
+    // of the file: directory entries, url pointer list, title index, cluster
+    // offset list. The header is written last at offset 0, once all positions
+    // are known, and finally the MD5 of the whole file is appended.
+    // Throws std::runtime_error on write or read failure.
+    void Creator::write() const
+    {
+      Fileheader header;
+      fillHeader(&header);
+
+      int out_fd = data->out_fd;
+
+      lseek(out_fd, header.getMimeListPos(), SEEK_SET);
+      TINFO(" write mimetype list");
+      for(auto& mimeType: data->mimeTypesList)
+      {
+        // Each mimetype is written with its terminating NUL.
+        _write(out_fd, mimeType.c_str(), mimeType.size()+1);
+      }
+
+      // An empty string terminates the mimetype list.
+      _write(out_fd, "", 1);
+
+      // Header and mimetype list must fit below CLUSTER_BASE_OFFSET, the
+      // position the file was seeked to when it was created.
+      ASSERT(lseek(out_fd, 0, SEEK_CUR), <, CLUSTER_BASE_OFFSET);
+
+      TINFO(" write directory entries");
+      lseek(out_fd, 0, SEEK_END);
+      for (Dirent* dirent: data->dirents)
+      {
+        // Remember where each dirent lands; needed for the url pointer list.
+        dirent->setOffset(offset_t(lseek(out_fd, 0, SEEK_CUR)));
+        dirent->write(out_fd);
+      }
+
+      TINFO(" write url prt list");
+      header.setUrlPtrPos(lseek(out_fd, 0, SEEK_CUR));
+      for (auto& dirent: data->dirents)
+      {
+        char tmp_buff[sizeof(offset_type)];
+        toLittleEndian(dirent->getOffset(), tmp_buff);
+        _write(out_fd, tmp_buff, sizeof(offset_type));
+      }
+
+      TINFO(" write title index");
+      header.setTitleIdxPos(lseek(out_fd, 0, SEEK_CUR));
+      for (Dirent* dirent: data->titleIdx)
+      {
+        // The title index stores article indexes, in title order.
+        char tmp_buff[sizeof(article_index_type)];
+        toLittleEndian(dirent->getIdx().v, tmp_buff);
+        _write(out_fd, tmp_buff, sizeof(article_index_type));
+      }
+
+      TINFO(" write cluster offset list");
+      header.setClusterPtrPos(lseek(out_fd, 0, SEEK_CUR));
+      for (auto cluster : data->clustersList)
+      {
+        char tmp_buff[sizeof(offset_type)];
+        toLittleEndian(cluster->getOffset(), tmp_buff);
+        _write(out_fd, tmp_buff, sizeof(offset_type));
+      }
+
+      // The checksum goes at the current end of the file.
+      header.setChecksumPos(lseek(out_fd, 0, SEEK_CUR));
+
+      TINFO(" write header");
+      lseek(out_fd, 0, SEEK_SET);
+      header.write(out_fd);
+
+      TINFO(" write checksum");
+      struct zim_MD5_CTX md5ctx;
+      unsigned char batch_read[1024+1];
+      // Re-read the whole file from the start to compute its MD5.
+      lseek(out_fd, 0, SEEK_SET);
+      zim_MD5Init(&md5ctx);
+      while (true) {
+         auto r = read(out_fd, batch_read, 1024);
+         if (r == -1) {
+           perror("Cannot read");
+           throw std::runtime_error("oups");
+         }
+         if (r == 0)
+           break;
+         // NUL-terminate the chunk (hence the +1 in the buffer size); the
+         // MD5 update below only consumes the `r` bytes read.
+         batch_read[r] = 0;
+         zim_MD5Update(&md5ctx, batch_read, r);
+      }
+      unsigned char digest[16];
+      zim_MD5Final(digest, &md5ctx);
+      // read() stopped at end of file, so the digest is appended there.
+      _write(out_fd, reinterpret_cast<const char*>(digest), 16);
+    }
+
+    CreatorData::CreatorData(const std::string& fname,
+                                   bool verbose,
+                                   bool withIndex,
+                                   std::string language)
+      : withIndex(withIndex),
+        indexingLanguage(language),
+#if defined(ENABLE_XAPIAN)
+        titleIndexer(language, IndexingMode::TITLE, true),
+#endif
+        verbose(verbose),
+        nbArticles(0),
+        nbRedirectArticles(0),
+        nbCompArticles(0),
+       nbUnCompArticles(0),
+       nbFileArticles(0),
+       nbIndexArticles(0),
+       nbClusters(0),
+       nbCompClusters(0),
+       nbUnCompClusters(0),
+        start_time(time(NULL))
+    {
+      basename =  (fname.size() > 4 && fname.compare(fname.size() - 4, 4, ".zim") == 0)
+                        ? fname.substr(0, fname.size() - 4)
+                        : fname;
+      auto zim_name = basename + ".zim.tmp";
+#ifdef _WIN32
+int mode =  _S_IREAD | _S_IWRITE;
+#else
+      mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+#endif
+      out_fd = open(zim_name.c_str(), O_RDWR|O_CREAT|O_TRUNC, mode);
+      if (out_fd == -1){
+        perror(nullptr);
+        std::ostringstream ss;
+        ss << "Cannot create file " << zim_name;
+        throw std::runtime_error(ss.str());
+      }
+      if(lseek(out_fd, CLUSTER_BASE_OFFSET, SEEK_SET) != CLUSTER_BASE_OFFSET) {
+        close(out_fd);
+        perror(nullptr);
+        throw std::runtime_error("Impossible to seek in file");
+      }
+
+      // We keep both a "compressed cluster" and an "uncompressed cluster"
+      // because we don't know which one will fill up first.  We also need
+      // to track the dirents currently in each, so we can fix up the
+      // cluster index if the other one ends up written first.
+      compCluster = new Cluster(compression);
+      uncompCluster = new Cluster(zimcompNone);
+
+#if defined(ENABLE_XAPIAN)
+      titleIndexer.indexingPrelude(basename+"_title.idx");
+      if (withIndex) {
+          indexer = new XapianIndexer(indexingLanguage, IndexingMode::FULL, true);
+          indexer->indexingPrelude(basename+".idx");
+      }
+#endif
+    }
+
+    // Releases the two clusters still open, every closed cluster and, when
+    // built with xapian, the full-text indexer.
+    CreatorData::~CreatorData()
+    {
+      // `delete` on a null pointer is a no-op, so no null checks are needed.
+      delete compCluster;
+      delete uncompCluster;
+      for (Cluster* closedCluster: clustersList) {
+        delete closedCluster;
+      }
+#if defined(ENABLE_XAPIAN)
+      delete indexer;
+#endif
+    }
+
+    void CreatorData::addDirent(Dirent* dirent, const Article* article)
+    {
+      auto ret = dirents.insert(dirent);
+      if (!ret.second) {
+        Dirent* existing = *ret.first;
+        if (existing->isRedirect() && !dirent->isRedirect()) {
+          unresolvedRedirectDirents.erase(existing);
+          dirents.erase(ret.first);
+          dirents.insert(dirent);
+        } else {
+          std::cerr << "Impossible to add " << dirent->getFullUrl().getLongUrl() << std::endl;
+          std::cerr << "  dirent's title to add is : " << dirent->getTitle() << std::endl;
+          std::cerr << "  existing dirent's title is : " << existing->getTitle() << std::endl;
+          return;
+        }
+      };
+
+      // If this is a redirect, we're done: there's no blob to add.
+      if (dirent->isRedirect())
+      {
+        unresolvedRedirectDirents.insert(dirent);
+        return;
+      }
+
+      // Add blob data to compressed or uncompressed cluster.
+      auto articleSize = article->getSize();
+      if (articleSize > 0)
+      {
+        isEmpty = false;
+      }
+
+      Cluster *cluster;
+      if (article->shouldCompress())
+      {
+        cluster = compCluster;
+      }
+      else
+      {
+        cluster = uncompCluster;
+      }
+
+      // If cluster will be too large, write it to dis, and open a new
+      // one for the content.
+      if ( cluster->count()
+        && cluster->size().v+articleSize >= minChunkSize * 1024
+         )
+      {
+        log_info("cluster with " << cluster->count() << " articles, " <<
+                 cluster->size() << " bytes; current title \"" <<
+                 dirent->getTitle() << '\"');
+        cluster = closeCluster(article->shouldCompress());
+      }
+
+      dirent->setCluster(cluster);
+      cluster->addArticle(article);
+    }
+
+    Dirent* CreatorData::createDirentFromArticle(const Article* article)
+    {
+      auto dirent = pool.getDirent();
+      dirent->setUrl(article->getUrl());
+      dirent->setTitle(article->getTitle());
+
+      if (article->isRedirect())
+      {
+        dirent->setRedirect(nullptr);
+        dirent->setRedirectUrl(article->getRedirectUrl());
+      }
+      else if (article->isLinktarget())
+      {
+        dirent->setLinktarget();
+      }
+      else if (article->isDeleted())
+      {
+        dirent->setDeleted();
+      }
+      else
+      {
+        auto mimetype = article->getMimeType();
+        if (mimetype.empty()) {
+          std::cerr << "Warning, " << article->getUrl().getLongUrl() << " have empty mimetype." << std::endl;
+          mimetype = "application/octet-stream";
+        }
+        dirent->setMimeType(getMimeTypeIdx(mimetype));
+      }
+      return dirent;
+    }
+
+    Cluster* CreatorData::closeCluster(bool compressed)
+    {
+      Cluster *cluster;
+      nbClusters++;
+      if (compressed )
+      {
+        cluster = compCluster;
+        nbCompClusters++;
+      } else {
+        cluster = uncompCluster;
+        nbUnCompClusters++;
+      }
+      cluster->setClusterIndex(cluster_index_t(clustersList.size()));
+      clustersList.push_back(cluster);
+      taskList.pushToQueue(new ClusterTask(cluster));
+      clusterToWrite.pushToQueue(cluster);
+
+      if (cluster->is_extended() )
+        isExtended = true;
+      if (compressed)
+      {
+        cluster = compCluster = new Cluster(compression);
+      } else {
+        cluster = uncompCluster = new Cluster(zimcompNone);
+      }
+      return cluster;
+    }
+
+    // Numbers the dirents from 0 upwards, following the iteration order of
+    // `dirents` (a set ordered by UrlCompare).
+    void CreatorData::setArticleIndexes()
+    {
+      INFO("set index");
+      article_index_t nextIdx(0);
+      for (Dirent* entry: dirents) {
+        entry->setIdx(nextIdx);
+        nextIdx += 1;
+      }
+    }
+
+    void CreatorData::resolveRedirectIndexes()
+    {
+      // translate redirect aid to index
+      INFO("Resolve redirect");
+      for (auto dirent: unresolvedRedirectDirents)
+      {
+        Dirent tmpDirent(dirent->getRedirectUrl());
+        auto target_pos = dirents.find(&tmpDirent);
+        if(target_pos == dirents.end()) {
+          INFO("Invalid redirection " << dirent->getFullUrl().getLongUrl() << " redirecting to (missing) " << dirent->getRedirectUrl().getLongUrl());
+          dirents.erase(dirent);
+        } else  {
+          dirent->setRedirect(*target_pos);
+        }
+      }
+    }
+
+    // Rebuilds `titleIdx` from scratch with every dirent of `dirents`; the
+    // multiset orders them with TitleCompare.
+    void CreatorData::createTitleIndex()
+    {
+      titleIdx.clear();
+      titleIdx.insert(dirents.begin(), dirents.end());
+    }
+
+    void CreatorData::resolveMimeTypes()
+    {
+      std::vector<std::string> oldMImeList;
+      std::vector<uint16_t> mapping;
+
+      for (auto& rmimeType: rmimeTypesMap)
+      {
+        oldMImeList.push_back(rmimeType.second);
+        mimeTypesList.push_back(rmimeType.second);
+      }
+
+      mapping.resize(oldMImeList.size());
+      std::sort(mimeTypesList.begin(), mimeTypesList.end());
+
+      for (unsigned i=0; i<oldMImeList.size(); ++i)
+      {
+        for (unsigned j=0; j<mimeTypesList.size(); ++j)
+        {
+          if (oldMImeList[i] == mimeTypesList[j])
+            mapping[i] = static_cast<uint16_t>(j);
+        }
+      }
+
+      for (auto& dirent: dirents)
+      {
+        if (dirent->isArticle())
+          dirent->setMimeType(mapping[dirent->getMimeType()]);
+      }
+    }
+
+    uint16_t CreatorData::getMimeTypeIdx(const std::string& mimeType)
+    {
+      auto it = mimeTypesMap.find(mimeType);
+      if (it == mimeTypesMap.end())
+      {
+        if (nextMimeIdx >= std::numeric_limits<uint16_t>::max())
+          throw std::runtime_error("too many distinct mime types");
+        mimeTypesMap[mimeType] = nextMimeIdx;
+        rmimeTypesMap[nextMimeIdx] = mimeType;
+        return nextMimeIdx++;
+      }
+
+      return it->second;
+    }
+
+    // Reverse lookup: name of the mimetype registered under `mimeTypeIdx`.
+    // Throws std::runtime_error for an unknown index.
+    const std::string& CreatorData::getMimeType(uint16_t mimeTypeIdx) const
+    {
+      const auto found = rmimeTypesMap.find(mimeTypeIdx);
+      if (found != rmimeTypesMap.end())
+        return found->second;
+      throw std::runtime_error("mime type index not found");
+    }
+  }
+}
diff --git a/src/writer/creatordata.h b/src/writer/creatordata.h

new file mode 100644 (file)

index 0000000..f72eda3
--- /dev/null
+++ b/src/writer/creatordata.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_CREATOR_DATA_H
+#define ZIM_WRITER_CREATOR_DATA_H
+
+#include <zim/fileheader.h>
+#include <zim/writer/article.h>
+#include "queue.h"
+#include "_dirent.h"
+#include "workers.h"
+#include "xapianIndexer.h"
+#include <vector>
+#include <map>
+#include <fstream>
+#include "config.h"
+
+#include "direntPool.h"
+
+#if defined(ENABLE_XAPIAN)
+  class XapianIndexer;
+#endif
+
+namespace zim
+{
+  namespace writer
+  {
+    // Ordering functor for containers of Dirent*: delegates to compareUrl().
+    struct UrlCompare {
+      bool operator() (const Dirent* d1, const Dirent* d2) const {
+        return compareUrl(d1, d2);
+      }
+    };
+
+    // Ordering functor for containers of Dirent*: delegates to compareTitle().
+    struct TitleCompare {
+      bool operator() (const Dirent* d1, const Dirent* d2) const {
+        return compareTitle(d1, d2);
+      }
+    };
+
+
+    class Cluster;
+    // Working state of a zim file creation: the dirents, the mimetype tables,
+    // the clusters (open and closed), the queues and threads processing them,
+    // the output file descriptor and a few statistics counters.
+    class CreatorData
+    {
+      public:
+        typedef std::set<Dirent*, UrlCompare> UrlSortedDirents;
+        typedef std::multiset<Dirent*, TitleCompare> TitleSortedDirents;
+        typedef std::map<std::string, uint16_t> MimeTypesMap;
+        typedef std::map<uint16_t, std::string> RMimeTypesMap;
+        typedef std::vector<std::string> MimeTypesList;
+        typedef std::vector<Cluster*> ClusterList;
+        typedef Queue<Cluster*> ClusterQueue;
+        typedef Queue<Task*> TaskQueue;
+        typedef std::vector<pthread_t> ThreadList;
+
+        // Opens "<basename>.zim.tmp" for `fname` and prepares the clusters
+        // and indexers; see creator.cpp.
+        CreatorData(const std::string& fname, bool verbose,
+                       bool withIndex, std::string language);
+        virtual ~CreatorData();
+
+        // Register a dirent and store the article content in a cluster.
+        void addDirent(Dirent* dirent, const Article* article);
+        // Build a (pool-allocated) dirent describing `article`.
+        Dirent* createDirentFromArticle(const Article* article);
+        // Retire the open compressed/uncompressed cluster; returns the new one.
+        Cluster* closeCluster(bool compressed);
+
+        // Finalization steps, called by Creator::finishZimCreation().
+        void setArticleIndexes();
+        void resolveRedirectIndexes();
+        void createTitleIndex();
+        void resolveMimeTypes();
+
+        // Mimetype name <-> index lookups.
+        uint16_t getMimeTypeIdx(const std::string& mimeType);
+        const std::string& getMimeType(uint16_t mimeTypeIdx) const;
+
+        // Cluster size threshold, in units of 1024 bytes (addDirent() compares
+        // the cluster size against minChunkSize * 1024).
+        size_t minChunkSize = 1024-64;
+
+        // Allocator owning every Dirent handed out by createDirentFromArticle().
+        DirentPool  pool;
+
+        UrlSortedDirents   dirents;
+        // Redirect dirents whose target has not been resolved yet.
+        UrlSortedDirents   unresolvedRedirectDirents;
+        TitleSortedDirents titleIdx;
+
+        // name -> provisional index, provisional index -> name, and the final
+        // sorted list of names (filled by resolveMimeTypes()).
+        MimeTypesMap mimeTypesMap;
+        RMimeTypesMap rmimeTypesMap;
+        MimeTypesList mimeTypesList;
+        uint16_t nextMimeIdx = 0;
+
+        // Closed clusters, in index order; owned by this object.
+        ClusterList clustersList;
+        // Clusters waiting for the writer thread.
+        ClusterQueue clusterToWrite;
+        // Tasks waiting for the worker threads.
+        TaskQueue taskList;
+        ThreadList workerThreads;
+        pthread_t  writerThread;
+        // Compression used for the "compressed" clusters.
+        CompressionType compression = zimcompLzma;
+        // Output file name without the ".zim" suffix.
+        std::string basename;
+        // Stays true until an article with a non-zero size has been added.
+        bool isEmpty = true;
+        // Set as soon as one closed cluster reports is_extended().
+        bool isExtended = false;
+        zsize_t clustersSize;
+        // Clusters currently being filled.
+        Cluster *compCluster = nullptr;
+        Cluster *uncompCluster = nullptr;
+        // File descriptor of the temporary output file.
+        int out_fd;
+
+        bool withIndex;
+        std::string indexingLanguage;
+#if defined(ENABLE_XAPIAN)
+        XapianIndexer titleIndexer;
+        // Full-text indexer; only created when withIndex is set.
+        XapianIndexer* indexer = nullptr;
+#endif
+
+        // Some stats
+        bool verbose;
+        article_index_type nbArticles;
+        article_index_type nbRedirectArticles;
+        article_index_type nbCompArticles;
+        article_index_type nbUnCompArticles;
+        article_index_type nbFileArticles;
+        article_index_type nbIndexArticles;
+        cluster_index_type nbClusters;
+        cluster_index_type nbCompClusters;
+        cluster_index_type nbUnCompClusters;
+        // Creation time of this object; base of the "T:" timings.
+        time_t start_time;
+
+        // Number of closed clusters.
+        cluster_index_t clusterCount() const
+        { return cluster_index_t(clustersList.size()); }
+
+        // Number of registered dirents.
+        article_index_t articleCount() const
+        { return article_index_t(dirents.size()); }
+
+        size_t getMinChunkSize()    { return minChunkSize; }
+        void setMinChunkSize(size_t s)   { minChunkSize = s; }
+    };
+
+  }
+
+}
+
+#endif // ZIM_WRITER_CREATOR_DATA_H
diff --git a/src/writer/dirent.cpp b/src/writer/dirent.cpp

new file mode 100644 (file)

index 0000000..d13485c
--- /dev/null
+++ b/src/writer/dirent.cpp
@@ -0,0 +1,77 @@
+/*
+ * Copyright (C) 2006 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "_dirent.h"
+#include <zim/zim.h>
+#include "buffer.h"
+#include "endian_tools.h"
+#include "log.h"
+#include <algorithm>
+#include <cstring>
+#ifdef _WIN32
+# include <io.h>
+#else
+# include <unistd.h>
+# define _write(fd, addr, size) if(::write((fd), (addr), (size)) != (ssize_t)(size)) \
+{throw std::runtime_error("Error writing");}
+#endif
+
+log_define("zim.dirent")
+
+void zim::writer::Dirent::write(int out_fd) const
+{
+  union
+  {
+    char d[16];
+    long a;
+  } header;
+  zim::toLittleEndian(getMimeType(), header.d);
+  header.d[2] = 0; // parameter size
+  header.d[3] = getNamespace();
+
+  log_debug("title=" << dirent.getTitle() << " title.size()=" << dirent.getTitle().size());
+
+  zim::toLittleEndian(getVersion(), header.d + 4);
+
+  if (isRedirect())
+  {
+    zim::toLittleEndian(getRedirectIndex().v, header.d + 8);
+    _write(out_fd, header.d, 12);
+  }
+  else if (isLinktarget() || isDeleted())
+  {
+    _write(out_fd, header.d, 8);
+  }
+  else
+  {
+    zim::toLittleEndian(zim::cluster_index_type(getClusterNumber()), header.d + 8);
+    zim::toLittleEndian(zim::blob_index_type(getBlobNumber()), header.d + 12);
+    _write(out_fd, header.d, 16);
+  }
+
+  auto& url = getUrl();
+  _write(out_fd, url.c_str(), url.size()+1);
+
+  std::string t = getTitle();
+  if (t != getUrl())
+    _write(out_fd, t.c_str(), t.size());
+  char c = 0;
+  _write(out_fd, &c, 1);
+
+}
diff --git a/src/writer/direntPool.h b/src/writer/direntPool.h

new file mode 100644 (file)

index 0000000..bc17da0
--- /dev/null
+++ b/src/writer/direntPool.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2019 Matthieu Gautier
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#ifndef ZIM_WRITER_DIRENTPOOL_H
+#define ZIM_WRITER_DIRENTPOOL_H
+
+#include "debug.h"
+#include "_dirent.h"
+
+namespace zim
+{
+  namespace writer {
+    class DirentPool {
+      private:
+        std::vector<Dirent*> pools;
+        uint16_t direntIndex;
+
+        void allocate_new_pool() {
+          pools.push_back(new Dirent[0xFFFF]);
+          direntIndex = 0;
+        }
+
+      public:
+        DirentPool() :
+          direntIndex(0xFFFF)
+        {}
+        ~DirentPool() {
+          for(auto direntArray: pools) {
+            delete[] direntArray;
+          }
+        }
+
+        Dirent* getDirent() {
+          if (direntIndex == 0xFFFF) {
+            allocate_new_pool();
+          }
+          return pools.back() + direntIndex++;
+        }
+    };
+  }
+}
+
+#endif // ZIM_WRITER_DIRENTPOOL_H
+
diff --git a/src/writer/queue.h b/src/writer/queue.h

new file mode 100644 (file)

index 0000000..c191bbf
--- /dev/null
+++ b/src/writer/queue.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_QUEUE_H
+#define OPENZIM_LIBZIM_QUEUE_H
+
+#define MAX_QUEUE_SIZE 10
+
+#include <pthread.h>
+#include <queue>
+#include "../tools.h"
+
+// FIFO queue whose operations are each protected by a pthread mutex.
+// pushToQueue() polls until the queue holds at most MAX_QUEUE_SIZE elements
+// before adding a new one; the read operations never block on an empty
+// queue and return false instead.
+template<typename T>
+class Queue {
+    public:
+        Queue() {pthread_mutex_init(&m_queueMutex,NULL);};
+        virtual ~Queue() {pthread_mutex_destroy(&m_queueMutex);};
+        // True when the queue holds no element.
+        virtual bool isEmpty();
+        // Number of elements currently queued.
+        virtual size_t size();
+        // Append `element`, waiting first while the queue is too full.
+        virtual void pushToQueue(const T& element);
+        // Copy the oldest element into `element` without removing it;
+        // returns false (and leaves `element` untouched) on an empty queue.
+        virtual bool getHead(T &element);
+        // Same as getHead(), but also removes the element from the queue.
+        virtual bool popFromQueue(T &element);
+
+    protected:
+        std::queue<T>   m_realQueue;
+        // Guards every access to m_realQueue.
+        pthread_mutex_t m_queueMutex;
+
+    private:
+        // Make this queue non copyable
+        Queue(const Queue&);
+        Queue& operator=(const Queue&);
+};
+
+// Tells, under the queue lock, whether the queue currently holds no element.
+template<typename T>
+bool Queue<T>::isEmpty() {
+    pthread_mutex_lock(&m_queueMutex);
+    const bool noElement = m_realQueue.empty();
+    pthread_mutex_unlock(&m_queueMutex);
+
+    return noElement;
+}
+
+// Returns the number of queued elements, read under the queue lock.
+template<typename T>
+size_t Queue<T>::size() {
+    pthread_mutex_lock(&m_queueMutex);
+    const size_t elementCount = m_realQueue.size();
+    pthread_mutex_unlock(&m_queueMutex);
+
+    return elementCount;
+}
+
+template<typename T>
+void Queue<T>::pushToQueue(const T &element) {
+    unsigned int wait = 0;
+    unsigned int queueSize = 0;
+
+    do {
+        zim::microsleep(wait);
+        pthread_mutex_lock(&m_queueMutex);
+        queueSize = m_realQueue.size();
+        pthread_mutex_unlock(&m_queueMutex);
+        wait += 10;
+    } while (queueSize > MAX_QUEUE_SIZE);
+
+    pthread_mutex_lock(&m_queueMutex);
+    m_realQueue.push(element);
+    pthread_mutex_unlock(&m_queueMutex);
+}
+
+// Copies the oldest element into `element` without removing it.
+// Returns false, leaving `element` untouched, when the queue is empty.
+template<typename T>
+bool Queue<T>::getHead(T &element) {
+  bool hasHead = false;
+
+  pthread_mutex_lock(&m_queueMutex);
+  if (!m_realQueue.empty()) {
+    element = m_realQueue.front();
+    hasHead = true;
+  }
+  pthread_mutex_unlock(&m_queueMutex);
+
+  return hasHead;
+}
+
+// Moves the oldest element out of the queue into `element`.
+// Returns false, leaving `element` untouched, when the queue is empty.
+template<typename T>
+bool Queue<T>::popFromQueue(T &element) {
+    bool popped = false;
+
+    pthread_mutex_lock(&m_queueMutex);
+    if (!m_realQueue.empty()) {
+        element = m_realQueue.front();
+        m_realQueue.pop();
+        popped = true;
+    }
+    pthread_mutex_unlock(&m_queueMutex);
+
+    return popped;
+}
+
+#endif // OPENZIM_LIBZIM_QUEUE_H
diff --git a/src/writer/workers.cpp b/src/writer/workers.cpp

new file mode 100644 (file)

index 0000000..46d9066
--- /dev/null
+++ b/src/writer/workers.cpp
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "config.h"
+
+#include "creatordata.h"
+#include "cluster.h"
+#include "debug.h"
+#include <zim/blob.h>
+#include "../endian_tools.h"
+#include <algorithm>
+#include <fstream>
+
+#if defined(ENABLE_XAPIAN)
+  #include "xapianIndexer.h"
+#endif
+
+#ifdef _WIN32
+#include <io.h>
+#else
+#include <unistd.h>
+#endif
+
+#include <stdio.h>
+#include <limits>
+#include <stdexcept>
+#include <sstream>
+#include "log.h"
+#include "../fs.h"
+#include "../tools.h"
+
+// Held around the add_document() call that IndexTask::run() makes on the
+// indexer's writable database.
+static pthread_mutex_t s_dbaccessLock = PTHREAD_MUTEX_INITIALIZER;
+// Out-of-class definitions of the static counters declared in workers.h;
+// the task constructors increment them and the task destructors decrement
+// them.
+std::atomic<unsigned long> zim::writer::ClusterTask::waiting_task(0);
+std::atomic<unsigned long> zim::writer::IndexTask::waiting_task(0);
+
+namespace zim
+{
+  namespace writer
+  {
+
+    inline unsigned int countWords(const string& text)
+    {
+      unsigned int numWords = 1;
+      unsigned int length = text.size();
+
+      for (unsigned int i = 0; i < length;) {
+        while (i < length && text[i] != ' ') {
+          i++;
+        }
+        numWords++;
+        i++;
+      }
+      return numWords;
+    }
+
+    // Weight given to the keywords when they are indexed in IndexTask::run().
+    const unsigned int keywordsBoostFactor = 3;
+
+    // Weight given to the title: 1, plus 1 for every full 500 units of
+    // `contentLength`.
+    inline unsigned int getTitleBoostFactor(const unsigned int contentLength)
+    {
+      const unsigned int fullChunks = contentLength / 500;
+      return fullChunks + 1;
+    }
+
+
+    // Close the cluster this task was created for. The CreatorData argument
+    // is not needed for that and is ignored.
+    void ClusterTask::run(CreatorData* /*data*/) {
+      cluster->close();
+    }
+
+    void IndexTask::run(CreatorData* data) {
+      Xapian::Stem stemmer;
+      Xapian::TermGenerator indexer;
+      try {
+        stemmer = Xapian::Stem(data->indexer->stemmer_language);
+        indexer.set_stemmer(stemmer);
+        indexer.set_stemming_strategy(Xapian::TermGenerator::STEM_ALL);
+      } catch (...) {
+        // No stemming for language.
+      }
+      indexer.set_stopper(&data->indexer->stopper);
+      indexer.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
+
+      zim::MyHtmlParser htmlParser;
+      try {
+        htmlParser.parse_html(p_article->getData(), "UTF-8", true);
+      } catch (...) {}
+      if (htmlParser.dump.find("NOINDEX") != string::npos)
+      {
+        return;
+      }
+
+      Xapian::Document document;
+      document.set_data(p_article->getUrl().getLongUrl());
+      indexer.set_document(document);
+
+      auto title = p_article->getTitle();
+      auto normalizedTitle = zim::removeAccents(title);
+      auto keywords = zim::removeAccents(htmlParser.keywords);
+      auto content = zim::removeAccents(htmlParser.dump);
+
+      document.add_value(0, title);
+
+      std::stringstream countWordStringStream;
+      countWordStringStream << countWords(htmlParser.dump);
+      document.add_value(1, countWordStringStream.str());
+
+      if (htmlParser.has_geoPosition) {
+        auto geoPosition = Xapian::LatLongCoord(
+        htmlParser.latitude, htmlParser.longitude).serialise();
+        document.add_value(2, geoPosition);
+      }
+
+      /* Index the title */
+      if (!normalizedTitle.empty()) {
+        indexer.index_text_without_positions(
+          normalizedTitle, getTitleBoostFactor(content.size()));
+      }
+
+      /* Index the keywords */
+      if (!keywords.empty()) {
+        indexer.index_text_without_positions(keywords, keywordsBoostFactor);
+      }
+
+      /* Index the content */
+      if (!content.empty()) {
+        indexer.index_text_without_positions(content);
+      }
+
+      pthread_mutex_lock(&s_dbaccessLock);
+      data->indexer->writableDatabase.add_document(document);
+      pthread_mutex_unlock(&s_dbaccessLock);
+    }
+
+    void* taskRunner(void* arg) {
+      auto creatorData = static_cast<zim::writer::CreatorData*>(arg);
+      Task* task;
+      unsigned int wait = 0;
+
+      while(true) {
+        microsleep(wait);
+        wait += 100;
+        if (creatorData->taskList.popFromQueue(task)) {
+          if (task == nullptr) {
+            return nullptr;
+          }
+          task->run(creatorData);
+          delete task;
+          wait = 0;
+        }
+      }
+      return nullptr;
+    }
+
+    void* clusterWriter(void* arg) {
+      auto creatorData = static_cast<zim::writer::CreatorData*>(arg);
+      Cluster* cluster;
+      unsigned int wait = 0;
+      while(true) {
+        microsleep(wait);
+        wait += 100;
+        if(creatorData->clusterToWrite.getHead(cluster)) {
+          if (cluster == nullptr) {
+            // All cluster writen, we can quit
+            return nullptr;
+          }
+          if (not cluster->isClosed()) {
+            continue;
+          }
+          creatorData->clusterToWrite.popFromQueue(cluster);
+          cluster->setOffset(offset_t(lseek(creatorData->out_fd, 0, SEEK_CUR)));
+          cluster->write(creatorData->out_fd);
+          wait = 0;
+        }
+      }
+      return nullptr;
+    }
+  }
+}
diff --git a/src/writer/workers.h b/src/writer/workers.h

new file mode 100644 (file)

index 0000000..9df11b0
--- /dev/null
+++ b/src/writer/workers.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright 2016 Matthieu Gautier <mgautier@kymeria.fr>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef OPENZIM_LIBZIM_WORKER_H
+#define OPENZIM_LIBZIM_WORKER_H
+
+#include <atomic>
+
+namespace zim {
+namespace writer {
+
+class Cluster;
+class CreatorData;
+
+// Abstract unit of work. Concrete tasks implement run(); the virtual
+// destructor allows deleting a task through a Task pointer.
+class Task {
+  public:
+    Task() = default;
+    virtual ~Task() = default;
+
+    // Execute the task; `data` is the CreatorData supplied by the caller.
+    virtual void run(CreatorData* data) = 0;
+};
+
+// Task bound to one Cluster. The static counter `waiting_task` is
+// incremented when a ClusterTask is constructed and decremented when it is
+// destroyed.
+class ClusterTask : public Task {
+  public:
+    ClusterTask(Cluster* cluster)
+      : cluster(cluster)
+    {
+      ++waiting_task;
+    }
+
+    ~ClusterTask() override
+    {
+      --waiting_task;
+    }
+
+    void run(CreatorData* data) override;
+
+    // Number of ClusterTask objects currently alive.
+    static std::atomic<unsigned long> waiting_task;
+
+  private:
+    // Not owned: the destructor does not delete it.
+    Cluster* cluster;
+};
+
+// Task bound to one article. The static counter `waiting_task` is
+// incremented when an IndexTask is constructed and decremented when it is
+// destroyed.
+class IndexTask : public Task {
+  public:
+    IndexTask(std::shared_ptr<Article> article)
+      : p_article(article)
+    {
+      ++waiting_task;
+    }
+
+    ~IndexTask() override
+    {
+      --waiting_task;
+    }
+
+    void run(CreatorData* data) override;
+
+    // Number of IndexTask objects currently alive.
+    static std::atomic<unsigned long> waiting_task;
+
+  private:
+    // Shared ownership of the article for the lifetime of the task.
+    std::shared_ptr<Article> p_article;
+};
+
+void* taskRunner(void* data);
+void* clusterWriter(void* data);
+
+}
+}
+
+#endif // OPENZIM_LIBZIM_WORKER_H
diff --git a/src/writer/xapianIndexer.cpp b/src/writer/xapianIndexer.cpp

new file mode 100644 (file)

index 0000000..c56b8ee
--- /dev/null
+++ b/src/writer/xapianIndexer.cpp
@@ -0,0 +1,164 @@
+/*
+ * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#include "xapianIndexer.h"
+#include "libzim-resources.h"
+#include "fs.h"
+#include "tools.h"
+#include <sstream>
+#include <fstream>
+#include <stdexcept>
+
+/* Constructor: store the language and the indexing mode, derive the
+   stemmer language and load the stop words. `verbose` is not used. */
+XapianIndexer::XapianIndexer(const std::string& language, IndexingMode indexingMode, const bool verbose)
+    : language(language),
+      indexingMode(indexingMode)
+{
+  /* Let ICU turn the given language code into the ISO-639 code used to
+     select the stemmer */
+  icu::Locale locale(language.c_str());
+  stemmer_language = locale.getLanguage();
+
+  /* Load the stop-word resource; a language without one simply gets an
+     empty list */
+  try {
+    stopwords = getResource("stopwords/" + language);
+  } catch(ResourceNotFound& e) {}
+
+  /* One stop word per line */
+  std::istringstream lines(stopwords);
+  std::string word;
+  while (std::getline(lines, word, '\n')) {
+    stopper.add(word);
+  }
+}
+
+/* Destructor: remove the files created for the index (the ".tmp" database
+   and the index file itself). Nothing is done when no index path was
+   set, and errors are never propagated. */
+XapianIndexer::~XapianIndexer()
+{
+  if (indexPath.empty()) {
+    return;
+  }
+
+  try {
+#ifndef _WIN32
+    // TODO: implement removal on Windows
+    zim::DEFAULTFS::remove(indexPath + ".tmp");
+    zim::DEFAULTFS::remove(indexPath);
+#endif
+  } catch (...) {
+    /* Do not raise */
+  }
+}
+
+void XapianIndexer::indexingPrelude(const string indexPath_)
+{
+  indexPath = indexPath_;
+  writableDatabase = Xapian::WritableDatabase(indexPath + ".tmp", Xapian::DB_CREATE_OR_OVERWRITE);
+  switch (indexingMode) {
+    case IndexingMode::TITLE:
+      writableDatabase.set_metadata("valuesmap", "title:0");
+      writableDatabase.set_metadata("kind", "title");
+      break;
+    case IndexingMode::FULL:
+      writableDatabase.set_metadata("valuesmap", "title:0;wordcount:1;geo.position:2");
+      writableDatabase.set_metadata("kind", "fulltext");
+      break;
+  }
+  writableDatabase.set_metadata("language", language);
+  writableDatabase.set_metadata("stopwords", stopwords);
+  writableDatabase.begin_transaction(true);
+}
+
+/* Index `article` with the routine matching the indexing mode. */
+void XapianIndexer::index(const zim::writer::Article* article)
+{
+  if (indexingMode == IndexingMode::TITLE) {
+    indexTitle(article);
+  } else if (indexingMode == IndexingMode::FULL) {
+    indexFull(article);
+  }
+}
+
+
+/* Does nothing: the body is empty, so index() has no effect in FULL mode.
+   NOTE(review): full-text documents appear to be built by
+   zim::writer::IndexTask (a friend of this class) instead - confirm. */
+void XapianIndexer::indexFull(const zim::writer::Article* article)
+{
+}
+
+/* Add to the database a document made of the title of `article`: the
+   document data is the article's long url, value slot 0 holds the title
+   as given and the indexed terms come from the accent-free title. */
+void XapianIndexer::indexTitle(const zim::writer::Article* article)
+{
+  Xapian::Stem languageStemmer;
+  Xapian::TermGenerator termGenerator;
+  try {
+    languageStemmer = Xapian::Stem(stemmer_language);
+    termGenerator.set_stemmer(languageStemmer);
+    termGenerator.set_stemming_strategy(Xapian::TermGenerator::STEM_SOME);
+  } catch (...) {
+    /* No stemming when it cannot be set up for this language */
+  }
+  termGenerator.set_stopper(&stopper);
+  termGenerator.set_stopper_strategy(Xapian::TermGenerator::STOP_ALL);
+
+  Xapian::Document doc;
+  doc.clear_values();
+  doc.set_data(article->getUrl().getLongUrl());
+  termGenerator.set_document(doc);
+
+  const std::string originalTitle = article->getTitle();
+  const std::string plainTitle = zim::removeAccents(originalTitle);
+
+  doc.add_value(0, originalTitle);
+
+  if (!plainTitle.empty()) {
+    termGenerator.index_text(plainTitle, 1);
+  }
+
+  /* add to the database */
+  writableDatabase.add_document(doc);
+}
+
+/* Commit the running transaction and immediately open a new one. */
+void XapianIndexer::flush()
+{
+  writableDatabase.commit_transaction();
+  writableDatabase.begin_transaction(true);
+}
+
+/* Finish indexing: commit everything, compact the working database into
+   the single file `indexPath` and close the working database. */
+void XapianIndexer::indexingPostlude()
+{
+  flush();
+  /* flush() re-opened a transaction; close that one as well */
+  writableDatabase.commit_transaction();
+  writableDatabase.commit();
+  writableDatabase.compact(indexPath, Xapian::DBCOMPACT_SINGLE_FILE);
+  writableDatabase.close();
+}
+
+/* Allocate with `new` a XapianMetaArticle bound to this indexer and its
+   indexing mode; this function does not delete it. */
+XapianMetaArticle* XapianIndexer::getMetaArticle()
+{
+  XapianMetaArticle* metaArticle = new XapianMetaArticle(this, indexingMode);
+  return metaArticle;
+}
+
+/* Return the size in bytes of the index file, obtained by opening it
+   positioned at its end.
+
+   Throws std::runtime_error when the file cannot be opened or its end
+   position cannot be read. Without this check the failure value -1
+   returned by tellg() would be converted to a huge unsigned size. */
+zim::size_type XapianMetaArticle::getSize() const
+{
+  std::ifstream in(indexer->getIndexPath(), std::ios::binary|std::ios::ate);
+  const std::ifstream::pos_type endPosition = in.tellg();
+  if (!in || endPosition == std::ifstream::pos_type(-1)) {
+    throw std::runtime_error(
+      "Cannot get the size of the index file " + indexer->getIndexPath());
+  }
+  return static_cast<zim::size_type>(static_cast<std::streamoff>(endPosition));
+}
+
+/* The content of this article is the index file itself: return its path. */
+std::string XapianMetaArticle::getFilename() const
+{
+  const std::string indexFile = indexer->getIndexPath();
+  return indexFile;
+}
+
+/* Always throws std::logic_error and never returns a Blob. */
+zim::Blob XapianMetaArticle::getData() const
+{
+  throw std::logic_error("We should not pass here.");
+}
diff --git a/src/writer/xapianIndexer.h b/src/writer/xapianIndexer.h

new file mode 100644 (file)

index 0000000..c934a9c
--- /dev/null
+++ b/src/writer/xapianIndexer.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright 2011 Emmanuel Engelhart <kelson@kiwix.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU  General Public License as published by
+ * the Free Software Foundation; either version 3 of the License, or
+ * any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ * MA 02110-1301, USA.
+ */
+
+#ifndef LIBZIM_WRITER_XAPIANINDEXER_H
+#define LIBZIM_WRITER_XAPIANINDEXER_H
+
+#include <zim/article.h>
+#include <zim/writer/article.h>
+
+#include <unicode/locid.h>
+#include <xapian.h>
+#include <zim/blob.h>
+#include "xapian/myhtmlparse.h"
+
+
+namespace zim {
+  namespace writer {
+    class IndexTask;
+  }
+}
+class XapianIndexer;
+
+// Kind of index a XapianIndexer builds.
+enum class IndexingMode {
+  // Title index: the database metadata "kind" is set to "title".
+  TITLE,
+  // Full-text index: the database metadata "kind" is set to "fulltext".
+  FULL
+};
+
+class XapianMetaArticle : public zim::writer::Article
+{
+ private:
+  XapianIndexer* indexer;
+  IndexingMode mode;
+  mutable std::string data;
+
+ public:
+  XapianMetaArticle(XapianIndexer* indexer, IndexingMode mode) : indexer(indexer), mode(mode)
+  {}
+  virtual ~XapianMetaArticle() = default;
+  virtual zim::Blob getData() const;
+  virtual zim::writer::Url getUrl() const {
+    switch (mode) {
+      case IndexingMode::FULL:
+        return zim::writer::Url('X', "fulltext/xapian");
+      case IndexingMode::TITLE:
+        return zim::writer::Url('X', "title/xapian");
+    }
+    return zim::writer::Url();
+  }
+  virtual std::string getTitle() const {
+    switch (mode) {
+      case IndexingMode::FULL:
+        return "Xapian Fulltext Index";
+      case IndexingMode::TITLE:
+        return "Xapian Title Index";
+    }
+    return "";
+  }
+  virtual std::string getMimeType() const { return "application/octet-stream+xapian"; }
+  virtual bool isRedirect() const { return false; }
+  virtual bool shouldIndex() const { return false; }
+  virtual bool shouldCompress() const { return false; }
+  virtual zim::writer::Url getRedirectUrl() const { return zim::writer::Url(); }
+  virtual zim::size_type getSize() const;
+  virtual std::string getFilename() const;
+};
+
+// Builds a Xapian index (title or full-text, depending on the mode) in a
+// working database and compacts it into a single file at the end.
+class XapianIndexer
+{
+ public:
+  // Stores language and mode, derives the stemmer language and loads the
+  // stop words.
+  XapianIndexer(const std::string& language, IndexingMode mode, bool verbose);
+  // Removes the index files when an index path has been set.
+  virtual ~XapianIndexer();
+  // Path given to indexingPrelude(); empty before that call.
+  std::string getIndexPath() { return indexPath; }
+  // Creates the working database at indexPath + ".tmp" and opens a
+  // transaction.
+  void indexingPrelude(const string indexPath);
+  // Indexes one article according to the indexing mode.
+  void index(const zim::writer::Article* article);
+  // Commits the running transaction and opens a new one.
+  void flush();
+  // Commits, compacts the working database into indexPath and closes it.
+  void indexingPostlude();
+  // Returns a newly allocated article describing the index file.
+  XapianMetaArticle* getMetaArticle();
+
+ protected:
+  void indexTitle(const zim::writer::Article* article);
+  void indexFull(const zim::writer::Article* article);
+
+  Xapian::WritableDatabase writableDatabase;
+  // Language code obtained from icu::Locale::getLanguage().
+  std::string stemmer_language;
+  Xapian::SimpleStopper stopper;
+  std::string indexPath;
+  std::string language;
+  // Raw content of the stop-word resource, one word per line.
+  std::string stopwords;
+  IndexingMode indexingMode;
+
+ // IndexTask reads stemmer_language and stopper and writes to
+ // writableDatabase directly.
+ friend class zim::writer::IndexTask;
+};
+
+#endif  // LIBZIM_WRITER_XAPIANINDEXER_H
diff --git a/src/xapian/htmlparse.cc b/src/xapian/htmlparse.cc

new file mode 100644 (file)

index 0000000..0f3316d
--- /dev/null
+++ b/src/xapian/htmlparse.cc
@@ -0,0 +1,377 @@
+/* htmlparse.cc: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2001 Ananova Ltd
+ * Copyright 2002,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "htmlparse.h"
+
+#include <xapian.h>
+
+// #include "utf8convert.h"
+
+#include <algorithm>
+
+#include <ctype.h>
+#include <cstring>
+#include <stdio.h>
+#include <stdlib.h>
+#include <pthread.h>
+
+using namespace std;
+
+// Lowercase `str` in place, one char at a time, with tolower(). Each char
+// is first converted to unsigned char so that negative char values are not
+// passed to tolower().
+// NOTE(review): myhtmlparse.cc defines an inline function with the same
+// name and body; keep both definitions identical.
+inline void
+lowercase_string(string &str)
+{
+    for (string::iterator i = str.begin(); i != str.end(); ++i) {
+       *i = tolower(static_cast<unsigned char>(*i));
+    }
+}
+
+// Table mapping an entity name to its code point; shared by all parsers
+// and filled by the first HtmlParser constructed.
+map<string, unsigned int> zim::HtmlParser::named_ents;
+// Held by the HtmlParser constructor while it checks and fills named_ents.
+static pthread_mutex_t sInitLock = PTHREAD_MUTEX_INITIALIZER;
+
+inline static bool
+p_notdigit(char c)
+{
+    return !isdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notxdigit(char c)
+{
+    return !isxdigit(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notalnum(char c)
+{
+    return !isalnum(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_notwhitespace(char c)
+{
+    return !isspace(static_cast<unsigned char>(c));
+}
+
+inline static bool
+p_nottag(char c)
+{
+    return !isalnum(static_cast<unsigned char>(c)) &&
+       c != '.' && c != '-' && c != ':'; // ':' for XML namespaces.
+}
+
+inline static bool
+p_whitespacegt(char c)
+{
+    return isspace(static_cast<unsigned char>(c)) || c == '>';
+}
+
+inline static bool
+p_whitespaceeqgt(char c)
+{
+    return isspace(static_cast<unsigned char>(c)) || c == '=' || c == '>';
+}
+
+// Look up the attribute `param` of the tag being processed. When present
+// its value is copied into `value` and true is returned; otherwise `value`
+// is left untouched and false is returned.
+bool
+zim::HtmlParser::get_parameter(const string & param, string & value)
+{
+    const map<string, string>::const_iterator found = parameters.find(param);
+    const bool present = (found != parameters.end());
+    if (present) {
+       value = found->second;
+    }
+    return present;
+}
+
+zim::HtmlParser::HtmlParser()
+{
+    static const struct ent { const char *n; unsigned int v; } ents[] = {
+#include "namedentities.h"
+       { NULL, 0 }
+    };
+    pthread_mutex_lock(&sInitLock);
+    if (named_ents.empty()) {
+       const struct ent *i = ents;
+       while (i->n) {
+           named_ents[string(i->n)] = i->v;
+           ++i;
+       }
+    }
+    pthread_mutex_unlock(&sInitLock);
+}
+
+// Replace, in place, the character references found in `s`:
+//   - "&#xHEX;" and "&#DEC;" numeric references,
+//   - "&name;" references whose name is found in named_ents.
+// The closing ';' is optional. A reference whose value is 0 (unknown name,
+// no digits, ...) is left as it is. Values below 0x80 become a single
+// char, other values are written with Xapian::Unicode::nonascii_to_utf8().
+void
+zim::HtmlParser::decode_entities(string &s)
+{
+    // We need a const_iterator version of s.end() - otherwise the
+    // find() and find_if() templates don't work...
+    string::const_iterator amp = s.begin(), s_end = s.end();
+    while ((amp = find(amp, s_end, '&')) != s_end) {
+       // val stays 0 when the reference cannot be decoded.
+       unsigned int val = 0;
+       // [p, end) is the digits or the name of the reference.
+       string::const_iterator end, p = amp + 1;
+       if (p != s_end && *p == '#') {
+           p++;
+           if (p != s_end && (*p == 'x' || *p == 'X')) {
+               // hex
+               p++;
+               end = find_if(p, s_end, p_notxdigit);
+               sscanf(s.substr(p - s.begin(), end - p).c_str(), "%x", &val);
+           } else {
+               // number
+               end = find_if(p, s_end, p_notdigit);
+               val = atoi(s.substr(p - s.begin(), end - p).c_str());
+           }
+       } else {
+           // named entity
+           end = find_if(p, s_end, p_notalnum);
+           string code = s.substr(p - s.begin(), end - p);
+           map<string, unsigned int>::const_iterator i;
+           i = named_ents.find(code);
+           if (i != named_ents.end()) val = i->second;
+       }
+       // Swallow the terminating ';' when there is one.
+       if (end < s_end && *end == ';') end++;
+       if (val) {
+           // Remember the position as an index: it survives the replace.
+           string::size_type amp_pos = amp - s.begin();
+           if (val < 0x80) {
+               s.replace(amp_pos, end - amp, 1u, char(val));
+           } else {
+               // Convert unicode value val to UTF-8.
+               char seq[4];
+               unsigned len = Xapian::Unicode::nonascii_to_utf8(val, seq);
+               s.replace(amp_pos, end - amp, seq, len);
+           }
+           s_end = s.end();
+           // We've modified the string, so the iterators are no longer
+           // valid...
+           amp = s.begin() + amp_pos + 1;
+       } else {
+           // Not decodable: resume the search after the reference.
+           amp = end;
+       }
+    }
+}
+
+void
+zim::HtmlParser::parse_html(const string &body)
+{
+    in_script = false;
+
+    parameters.clear();
+    string::const_iterator start = body.begin();
+
+    while (true) {
+       // Skip through until we find an HTML tag, a comment, or the end of
+       // document.  Ignore isolated occurrences of `<' which don't start
+       // a tag or comment.
+       string::const_iterator p = start;
+       while (true) {
+           p = find(p, body.end(), '<');
+           if (p == body.end()) break;
+           unsigned char ch = *(p + 1);
+
+           // Tag, closing tag, or comment (or SGML declaration).
+           if ((!in_script && isalpha(ch)) || ch == '/' || ch == '!') break;
+
+           if (ch == '?') {
+               // PHP code or XML declaration.
+               // XML declaration is only valid at the start of the first line.
+               // FIXME: need to deal with BOMs...
+               if (p != body.begin() || body.size() < 20) break;
+
+               // XML declaration looks something like this:
+               // <?xml version="1.0" encoding="UTF-8"?>
+               if (p[2] != 'x' || p[3] != 'm' || p[4] != 'l') break;
+               if (strchr(" \t\r\n", p[5]) == NULL) break;
+
+               string::const_iterator decl_end = find(p + 6, body.end(), '?');
+               if (decl_end == body.end()) break;
+
+               // Default charset for XML is UTF-8.
+               charset = "UTF-8";
+
+               string decl(p + 6, decl_end);
+               size_t enc = decl.find("encoding");
+               if (enc == string::npos) break;
+
+               enc = decl.find_first_not_of(" \t\r\n", enc + 8);
+               if (enc == string::npos || enc == decl.size()) break;
+
+               if (decl[enc] != '=') break;
+               
+               enc = decl.find_first_not_of(" \t\r\n", enc + 1);
+               if (enc == string::npos || enc == decl.size()) break;
+
+               if (decl[enc] != '"' && decl[enc] != '\'') break;
+
+               char quote = decl[enc++];
+               size_t enc_end = decl.find(quote, enc);
+
+               if (enc != string::npos)
+                   charset = decl.substr(enc, enc_end - enc);
+
+               break;
+           }
+           p++;
+       }
+
+       // Process text up to start of tag.
+       if (p > start) {
+           string text = body.substr(start - body.begin(), p - start);
+           // convert_to_utf8(text, charset);
+           decode_entities(text);
+           process_text(text);
+       }
+
+       if (p == body.end()) break;
+
+       start = p + 1;
+
+       if (start == body.end()) break;
+
+       if (*start == '!') {
+           if (++start == body.end()) break;
+           if (++start == body.end()) break;
+           // comment or SGML declaration
+           if (*(start - 1) == '-' && *start == '-') {
+               ++start;
+               string::const_iterator close = find(start, body.end(), '>');
+               // An unterminated comment swallows rest of document
+               // (like Netscape, but unlike MSIE IIRC)
+               if (close == body.end()) break;
+
+               p = close;
+               // look for -->
+               while (p != body.end() && (*(p - 1) != '-' || *(p - 2) != '-'))
+                   p = find(p + 1, body.end(), '>');
+
+               if (p != body.end()) {
+                   // Check for htdig's "ignore this bit" comments.
+                   if (p - start == 15 && string(start, p - 2) == "htdig_noindex") {
+                       string::size_type i;
+                       i = body.find("<!--/htdig_noindex-->", p + 1 - body.begin());
+                       if (i == string::npos) break;
+                       start = body.begin() + i + 21;
+                       continue;
+                   }
+                   // If we found --> skip to there.
+                   start = p;
+               } else {
+                   // Otherwise skip to the first > we found (as Netscape does).
+                   start = close;
+               }
+           } else {
+               // just an SGML declaration, perhaps giving the DTD - ignore it
+               start = find(start - 1, body.end(), '>');
+               if (start == body.end()) break;
+           }
+           ++start;
+       } else if (*start == '?') {
+           if (++start == body.end()) break;
+           // PHP - swallow until ?> or EOF
+           start = find(start + 1, body.end(), '>');
+
+           // look for ?>
+           while (start != body.end() && *(start - 1) != '?')
+               start = find(start + 1, body.end(), '>');
+
+           // unterminated PHP swallows rest of document (rather arbitrarily
+           // but it avoids polluting the database when things go wrong)
+           if (start != body.end()) ++start;
+       } else {
+           // opening or closing tag
+           int closing = 0;
+
+           if (*start == '/') {
+               closing = 1;
+               start = find_if(start + 1, body.end(), p_notwhitespace);
+           }
+
+           p = start;
+           start = find_if(start, body.end(), p_nottag);
+           string tag = body.substr(p - body.begin(), start - p);
+           // convert tagname to lowercase
+           lowercase_string(tag);
+
+           if (closing) {
+               closing_tag(tag);
+               if (in_script && tag == "script") in_script = false;
+
+               /* ignore any bogus parameters on closing tags */
+               p = find(start, body.end(), '>');
+               if (p == body.end()) break;
+               start = p + 1;
+           } else {
+               // FIXME: parse parameters lazily.
+               while (start < body.end() && *start != '>') {
+                   string name, value;
+
+                   p = find_if(start, body.end(), p_whitespaceeqgt);
+
+                   name.assign(body, start - body.begin(), p - start);
+
+                   p = find_if(p, body.end(), p_notwhitespace);
+
+                   start = p;
+                   if (start != body.end() && *start == '=') {
+                       start = find_if(start + 1, body.end(), p_notwhitespace);
+
+                       p = body.end();
+
+                       int quote = *start;
+                       if (quote == '"' || quote == '\'') {
+                           start++;
+                           p = find(start, body.end(), quote);
+                       }
+
+                       if (p == body.end()) {
+                           // unquoted or no closing quote
+                           p = find_if(start, body.end(), p_whitespacegt);
+                       }
+                       value.assign(body, start - body.begin(), p - start);
+                       start = find_if(p, body.end(), p_notwhitespace);
+
+                       if (!name.empty()) {
+                           // convert parameter name to lowercase
+                           lowercase_string(name);
+                           // in case of multiple entries, use the first
+                           // (as Netscape does)
+                           parameters.insert(make_pair(name, value));
+                       }
+                   }
+               }
+#if 0
+               cout << "<" << tag;
+               map<string, string>::const_iterator x;
+               for (x = parameters.begin(); x != parameters.end(); x++) {
+                   cout << " " << x->first << "=\"" << x->second << "\"";
+               }
+               cout << ">\n";
+#endif
+               opening_tag(tag);
+               parameters.clear();
+
+               // In <script> tags we ignore opening tags to avoid problems
+               // with "a<b".
+               if (tag == "script") in_script = true;
+
+               if (start != body.end() && *start == '>') ++start;
+           }
+       }
+    }
+}
diff --git a/src/xapian/htmlparse.h b/src/xapian/htmlparse.h

new file mode 100644 (file)

index 0000000..a884b2a
--- /dev/null
+++ b/src/xapian/htmlparse.h
@@ -0,0 +1,53 @@
+/* htmlparse.h: simple HTML parser for omega indexer
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_HTMLPARSE_H
+#define OMEGA_INCLUDED_HTMLPARSE_H
+
+#include <string>
+#include <map>
+
+using std::string;
+using std::map;
+
+namespace zim {
+
+// Simple HTML parser. parse_html() walks through a document and calls the
+// virtual hooks process_text(), opening_tag() and closing_tag(), which do
+// nothing here and are meant to be overridden.
+class HtmlParser {
+       // Attributes of the tag being processed, keyed by lowercased name.
+       map<string, string> parameters;
+    protected:
+       // Replace, in place, the character references found in `s`.
+       void decode_entities(string &s);
+       // True between an opening <script> tag and its closing tag.
+       bool in_script;
+       string charset;
+       // Entity name -> code point table shared by all parsers.
+       static map<string, unsigned int> named_ents;
+
+       // Copy the value of attribute `param` into `value`; false if the
+       // tag being processed has no such attribute.
+       bool get_parameter(const string & param, string & value);
+    public:
+       virtual void process_text(const string &/*text*/) { }
+       virtual void opening_tag(const string &/*tag*/) { }
+       virtual void closing_tag(const string &/*tag*/) { }
+       virtual void parse_html(const string &text);
+       HtmlParser();
+       virtual ~HtmlParser() { }
+};
+
+};
+
+#endif // OMEGA_INCLUDED_HTMLPARSE_H
diff --git a/src/xapian/myhtmlparse.cc b/src/xapian/myhtmlparse.cc

new file mode 100644 (file)

index 0000000..c260b31
--- /dev/null
+++ b/src/xapian/myhtmlparse.cc
@@ -0,0 +1,322 @@
+/* myhtmlparse.cc: subclass of HtmlParser for extracting text.
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2007,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+// #include <config.h>
+
+#include "myhtmlparse.h"
+
+// #include "utf8convert.h"
+
+#include <ctype.h>
+#include <string.h>
+#include <sstream>
+
+// Lower-case every character of STR in place using tolower().
+inline void
+lowercase_string(string &str)
+{
+    // Go through unsigned char: handing tolower() a negative char value is
+    // undefined behaviour.
+    for (char &ch : str)
+       ch = tolower(static_cast<unsigned char>(ch));
+}
+
+// Parse TEXT, first recording the character set to assume (CHARSET_) and
+// whether that character set was itself taken from a <meta> tag
+// (CHARSET_FROM_META_).
+void
+zim::MyHtmlParser::parse_html(const string &text, const string &charset_,
+                             bool charset_from_meta_)
+{
+    // opening_tag() reads both fields while the base parser runs, so they
+    // have to be stored before parsing starts.
+    this->charset_from_meta = charset_from_meta_;
+    this->charset = charset_;
+
+    this->HtmlParser::parse_html(text);
+}
+
+// Append the words of TEXT to dump, with each run of whitespace between
+// words reduced to one space.  Text seen while inside <script> or <style>
+// is dropped.
+void
+zim::MyHtmlParser::process_text(const string &text)
+{
+    if (text.empty() || in_script_tag || in_style_tag) return;
+
+    string::size_type word_start = text.find_first_not_of(WHITESPACE);
+    // Leading whitespace -- or nothing but whitespace, in which case
+    // word_start is npos -- means a space is owed before the next word.
+    if (word_start != 0) pending_space = true;
+
+    while (word_start != string::npos) {
+       // An owed space is only written once dump already has content.
+       if (pending_space && !dump.empty()) dump += ' ';
+
+       const string::size_type word_end =
+           text.find_first_of(WHITESPACE, word_start);
+       if (word_end == string::npos) {
+           // The last word runs to the end of TEXT: nothing is owed.
+           pending_space = false;
+           dump.append(text, word_start, string::npos);
+           return;
+       }
+
+       pending_space = true;
+       dump.append(text, word_start, word_end - word_start);
+       word_start = text.find_first_not_of(WHITESPACE, word_end + 1);
+    }
+}
+
+// Parse the floating-point number at the start of STR.
+//
+// Leading whitespace is skipped and anything following the number is
+// ignored.  Throws std::ios_base::failure when no number can be extracted
+// (empty or non-numeric input, or a value out of range for float), so that
+// callers guarding the call with try/catch -- as the geo.position handling
+// in opening_tag() does -- really do see malformed values instead of
+// silently receiving 0.
+inline float _stof(std::string str){
+    std::istringstream stream(str);
+    // By default a failed extraction only sets failbit; turn it into an
+    // exception.  eofbit is left out of the mask on purpose: reaching the
+    // end of the input after a complete number is the normal case.
+    stream.exceptions(std::ios_base::failbit | std::ios_base::badbit);
+    float ret = 0;
+    stream >> ret;
+    return ret;
+}
+
+// Hook invoked with the name of each opening tag.
+//
+// Effects, depending on TAG:
+//  - for the tags listed in the switch below, set pending_space so that
+//    process_text() puts a space before the next text it appends;
+//  - <body>: discard everything collected in dump so far;
+//  - <style> / <script>: set the flag that makes process_text() ignore text;
+//  - <meta>: pick up description, keywords, robots, geo.position and charset
+//    information (see the comments in that branch).
+//
+// Two values are thrown to abort parsing: `true` when a robots meta tag
+// forbids indexing, and a `string` naming the charset when the document
+// declares one that differs from the current `charset`.
+// NOTE(review): the code catching these is not in this file -- presumably
+// the caller driving parse_html(); confirm.
+//
+// All comparisons are against lower-case literals.
+// NOTE(review): this assumes TAG arrives already lower-cased -- confirm in
+// HtmlParser::parse_html().
+void
+zim::MyHtmlParser::opening_tag(const string &tag)
+{
+    if (tag.empty()) return;
+    // Switch on the first character so only a few string compares are needed.
+    switch (tag[0]) {
+       case 'a':
+           if (tag == "address") pending_space = true;
+           break;
+       case 'b':
+           if (tag == "body") {
+               // Drop whatever text was collected before <body>.
+               dump.resize(0);
+               break;
+           }
+           if (tag == "blockquote" || tag == "br") pending_space = true;
+           break;
+       case 'c':
+           if (tag == "center") pending_space = true;
+           break;
+       case 'd':
+           if (tag == "dd" || tag == "dir" || tag == "div" || tag == "dl" ||
+               tag == "dt") pending_space = true;
+           break;
+       case 'e':
+           if (tag == "embed") pending_space = true;
+           break;
+       case 'f':
+           if (tag == "fieldset" || tag == "form") pending_space = true;
+           break;
+       case 'h':
+           // hr, and h1, ..., h6
+           if (tag.length() == 2 && strchr("r123456", tag[1]))
+               pending_space = true;
+           break;
+       case 'i':
+           if (tag == "iframe" || tag == "img" || tag == "isindex" ||
+               tag == "input") pending_space = true;
+           break;
+       case 'k':
+           if (tag == "keygen") pending_space = true;
+           break;
+       case 'l':
+           if (tag == "legend" || tag == "li" || tag == "listing")
+               pending_space = true;
+           break;
+       case 'm':
+           if (tag == "meta") {
+               string content;
+               if (get_parameter("content", content)) {
+                   // <meta ... content="..."> with either name= or
+                   // http-equiv=.
+                   string name;
+                   if (get_parameter("name", name)) {
+                       lowercase_string(name);
+                       if (name == "description") {
+                           // Only the first description is kept.
+                           if (sample.empty()) {
+                               swap(sample, content);
+                               // convert_to_utf8(sample, charset);
+                               decode_entities(sample);
+                           }
+                       } else if (name == "keywords") {
+                           // Keywords from several meta tags are joined
+                           // with single spaces.
+                           if (!keywords.empty()) keywords += ' ';
+                           // convert_to_utf8(content, charset);
+                           decode_entities(content);
+                           keywords += content;
+                       } else if (name == "robots") {
+                           decode_entities(content);
+                           lowercase_string(content);
+                           if (content.find("none") != string::npos ||
+                               content.find("noindex") != string::npos) {
+                               // Indexing is forbidden: record it and abort
+                               // the parse.
+                               indexing_allowed = false;
+                               throw true;
+                           }
+                       } 
+                        else if (name == "geo.position") {
+                           // Expected form: "<latitude>;<longitude>".
+                           auto sep_pos = content.find(";");
+                           if (sep_pos != string::npos) {
+                               try {
+                                   latitude = _stof(content.substr(0, sep_pos));
+                                   longitude = _stof(content.substr(sep_pos+1));
+                                   has_geoPosition = true;
+                               } catch (...) {
+                                   //invalid value in content, just pass and continue.
+                               }
+                           }
+                       }
+                       break;
+                   }
+                   // If the current charset came from a meta tag, don't
+                   // force reparsing again!
+                   if (charset_from_meta) break;
+                   string hdr;
+                   if (get_parameter("http-equiv", hdr)) {
+                       lowercase_string(hdr);
+                       if (hdr == "content-type") {
+                           // Extract the value of "charset=" from something
+                           // like "text/html; charset=utf-8".
+                           lowercase_string(content);
+                           size_t start = content.find("charset=");
+                           if (start == string::npos) break;
+                           // 8 == strlen("charset=")
+                           start += 8;
+                           if (start == content.size()) break;
+                           size_t end = start;
+                           if (content[start] != '"') {
+                               // Unquoted value: it ends at the first
+                               // control, space, non-ASCII or separator
+                               // character.
+                               while (end < content.size()) {
+                                   unsigned char ch = content[end];
+                                   if (ch <= 32 || ch >= 127 ||
+                                       strchr(";()<>@,:\\\"/[]?={}", ch))
+                                       break;
+                                   ++end;
+                               }
+                           } else {
+                               // Quoted value: skip the opening quote and
+                               // read up to the closing one.  A backslash
+                               // is erased and the character after it is
+                               // kept as-is.
+                               ++start;
+                               ++end;
+                               while (end < content.size()) {
+                                   unsigned char ch = content[end];
+                                   if (ch == '"') break;
+                                   if (ch == '\\') content.erase(end, 1);
+                                   ++end;
+                               }
+                           }
+                           string newcharset(content, start, end - start);
+                           if (charset != newcharset) {
+                               // Abort the parse, reporting the declared
+                               // charset.
+                               throw newcharset;
+                           }
+                       }
+                   }
+                   break;
+               }
+               // No content= attribute: only <meta charset="..."> is of
+               // interest.
+               if (charset_from_meta) break;
+               string newcharset;
+               if (get_parameter("charset", newcharset)) {
+                   // HTML5 added: <meta charset="...">
+                   lowercase_string(newcharset);
+                   if (charset != newcharset) {
+                       throw newcharset;
+                   }
+               }
+               break;
+           }
+           if (tag == "marquee" || tag == "menu" || tag == "multicol")
+               pending_space = true;
+           break;
+       case 'o':
+           if (tag == "ol" || tag == "option") pending_space = true;
+           break;
+       case 'p':
+           if (tag == "p" || tag == "pre" || tag == "plaintext")
+               pending_space = true;
+           break;
+       case 'q':
+           if (tag == "q") pending_space = true;
+           break;
+       case 's':
+           // From here until the matching closing tag, process_text()
+           // ignores text.
+           if (tag == "style") {
+               in_style_tag = true;
+               break;
+           }
+           if (tag == "script") {
+               in_script_tag = true;
+               break;
+           }
+           if (tag == "select") pending_space = true;
+           break;
+       case 't':
+           if (tag == "table" || tag == "td" || tag == "textarea" ||
+               tag == "th") pending_space = true;
+           break;
+       case 'u':
+           if (tag == "ul") pending_space = true;
+           break;
+       case 'x':
+           if (tag == "xmp") pending_space = true;
+           break;
+    }
+}
+
+// Hook invoked with the name of each closing tag.
+//
+//  - </body> aborts the parse by throwing `true`;
+//  - </style> and </script> clear the flag that makes process_text() ignore
+//    text;
+//  - </title> moves the text collected so far into title (first title only);
+//  - for the remaining tags handled here, pending_space is set so that a
+//    space separates the text on either side of the tag.
+void
+zim::MyHtmlParser::closing_tag(const string &tag)
+{
+    if (tag.empty()) return;
+
+    if (tag == "body") throw true;
+
+    if (tag == "style") {
+       in_style_tag = false;
+       return;
+    }
+    if (tag == "script") {
+       in_script_tag = false;
+       return;
+    }
+    if (tag == "title") {
+       if (title.empty()) swap(title, dump);
+       return;
+    }
+
+    if (tag[0] == 'h') {
+       // hr, and h1, ..., h6
+       if (tag.length() == 2 && strchr("r123456", tag[1]))
+           pending_space = true;
+       return;
+    }
+
+    // Closing tags after which a space is owed.
+    static const char * const spacing_tags[] = {
+       "address",
+       "blockquote", "br",
+       "center",
+       "dd", "dir", "div", "dl", "dt",
+       "fieldset", "form",
+       "iframe",
+       "legend", "li", "listing",
+       "marquee", "menu",
+       "ol", "option",
+       "p", "pre",
+       "q",
+       "select",
+       "table", "td", "textarea", "th",
+       "ul",
+       "xmp"
+    };
+    for (const char *name : spacing_tags) {
+       if (tag == name) {
+           pending_space = true;
+           return;
+       }
+    }
+}
diff --git a/src/xapian/myhtmlparse.h b/src/xapian/myhtmlparse.h

new file mode 100644 (file)

index 0000000..a1f2101
--- /dev/null
+++ b/src/xapian/myhtmlparse.h
@@ -0,0 +1,75 @@
+/* myhtmlparse.h: subclass of HtmlParser for extracting text
+ *
+ * Copyright 1999,2000,2001 BrightStation PLC
+ * Copyright 2002,2003,2004,2006,2008 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301
+ * USA
+ */
+
+#ifndef OMEGA_INCLUDED_MYHTMLPARSE_H
+#define OMEGA_INCLUDED_MYHTMLPARSE_H
+
+#include "htmlparse.h"
+
+// FIXME: Should we include \xa0 which is non-breaking space in iso-8859-1, but
+// not in all charsets and perhaps spans of all \xa0 should become a single
+// \xa0?
+#define WHITESPACE " \t\n\r"
+
+namespace zim {
+
+// HtmlParser subclass that extracts the plain text of a document together
+// with a few pieces of metadata (title, description, keywords, robots
+// directive, geo position).  The parser hooks are implemented in
+// myhtmlparse.cc; the results are left in the public data members below.
+class MyHtmlParser : public HtmlParser {
+    public:
+       // Set by opening_tag() and cleared by closing_tag() for <script> and
+       // <style>; process_text() ignores text while either is set.
+       bool in_script_tag;
+       bool in_style_tag;
+       // True when a space must be written before the next text appended
+       // to dump.
+       bool pending_space;
+       // Cleared when a robots meta tag contains "none" or "noindex".
+       bool indexing_allowed;
+       // When true, charset declarations found in <meta> tags are ignored.
+       bool charset_from_meta;
+    // Values read from <meta name="geo.position" content="lat;lon">;
+    // meaningful only when has_geoPosition is true.
+    float latitude, longitude;
+    bool has_geoPosition;
+       // title:    text collected up to the first </title>
+       // sample:   content of the first description meta tag
+       // keywords: contents of all keywords meta tags, space separated
+       // dump:     text collected by process_text()
+       string title, sample, keywords, dump;
+       void process_text(const string &text);
+       void opening_tag(const string &tag);
+       void closing_tag(const string &tag);
+       // Keep the one-argument base-class overload visible next to the
+       // three-argument overload declared below.
+       using HtmlParser::parse_html;
+       void parse_html(const string &text, const string &charset_,
+                       bool charset_from_meta_);
+       MyHtmlParser() :
+               in_script_tag(false),
+               in_style_tag(false),
+               pending_space(false),
+               indexing_allowed(true),
+               charset_from_meta(false),
+        latitude(0), longitude(0), has_geoPosition(false) { }
+
+       // Put every member declared in this class back to its freshly
+       // constructed state.  Members inherited from HtmlParser are not
+       // touched here.
+       void reset() {
+           in_script_tag = false;
+           in_style_tag = false;
+           pending_space = false;
+           indexing_allowed = true;
+           charset_from_meta = false;
+        latitude = longitude = 0;
+        has_geoPosition = false;
+           title.resize(0);
+           sample.resize(0);
+           keywords.resize(0);
+           dump.resize(0);
+       }
+};
+
+};
+
+#endif // OMEGA_INCLUDED_MYHTMLPARSE_H
diff --git a/src/xapian/namedentities.h b/src/xapian/namedentities.h

new file mode 100644 (file)

index 0000000..8b7f03e
--- /dev/null
+++ b/src/xapian/namedentities.h
@@ -0,0 +1,279 @@
+/* namedentities.h: named HTML entities.
+ *
+ * Copyright (C) 2006,2007 Olly Betts
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ */
+
+#ifndef OMEGA_INCLUDED_NAMEDENTITIES_H
+#define OMEGA_INCLUDED_NAMEDENTITIES_H
+
+// Names and values from: "Character entity references in HTML 4"
+// http://www.w3.org/TR/html4/sgml/entities.html
+{ "quot", 34 },
+{ "amp", 38 },
+{ "apos", 39 }, // Not in HTML 4 list but used in OpenOffice XML.
+{ "lt", 60 },
+{ "gt", 62 },
+{ "nbsp", 160 },
+{ "iexcl", 161 },
+{ "cent", 162 },
+{ "pound", 163 },
+{ "curren", 164 },
+{ "yen", 165 },
+{ "brvbar", 166 },
+{ "sect", 167 },
+{ "uml", 168 },
+{ "copy", 169 },
+{ "ordf", 170 },
+{ "laquo", 171 },
+{ "not", 172 },
+{ "shy", 173 },
+{ "reg", 174 },
+{ "macr", 175 },
+{ "deg", 176 },
+{ "plusmn", 177 },
+{ "sup2", 178 },
+{ "sup3", 179 },
+{ "acute", 180 },
+{ "micro", 181 },
+{ "para", 182 },
+{ "middot", 183 },
+{ "cedil", 184 },
+{ "sup1", 185 },
+{ "ordm", 186 },
+{ "raquo", 187 },
+{ "frac14", 188 },
+{ "frac12", 189 },
+{ "frac34", 190 },
+{ "iquest", 191 },
+{ "Agrave", 192 },
+{ "Aacute", 193 },
+{ "Acirc", 194 },
+{ "Atilde", 195 },
+{ "Auml", 196 },
+{ "Aring", 197 },
+{ "AElig", 198 },
+{ "Ccedil", 199 },
+{ "Egrave", 200 },
+{ "Eacute", 201 },
+{ "Ecirc", 202 },
+{ "Euml", 203 },
+{ "Igrave", 204 },
+{ "Iacute", 205 },
+{ "Icirc", 206 },
+{ "Iuml", 207 },
+{ "ETH", 208 },
+{ "Ntilde", 209 },
+{ "Ograve", 210 },
+{ "Oacute", 211 },
+{ "Ocirc", 212 },
+{ "Otilde", 213 },
+{ "Ouml", 214 },
+{ "times", 215 },
+{ "Oslash", 216 },
+{ "Ugrave", 217 },
+{ "Uacute", 218 },
+{ "Ucirc", 219 },
+{ "Uuml", 220 },
+{ "Yacute", 221 },
+{ "THORN", 222 },
+{ "szlig", 223 },
+{ "agrave", 224 },
+{ "aacute", 225 },
+{ "acirc", 226 },
+{ "atilde", 227 },
+{ "auml", 228 },
+{ "aring", 229 },
+{ "aelig", 230 },
+{ "ccedil", 231 },
+{ "egrave", 232 },
+{ "eacute", 233 },
+{ "ecirc", 234 },
+{ "euml", 235 },
+{ "igrave", 236 },
+{ "iacute", 237 },
+{ "icirc", 238 },
+{ "iuml", 239 },
+{ "eth", 240 },
+{ "ntilde", 241 },
+{ "ograve", 242 },
+{ "oacute", 243 },
+{ "ocirc", 244 },
+{ "otilde", 245 },
+{ "ouml", 246 },
+{ "divide", 247 },
+{ "oslash", 248 },
+{ "ugrave", 249 },
+{ "uacute", 250 },
+{ "ucirc", 251 },
+{ "uuml", 252 },
+{ "yacute", 253 },
+{ "thorn", 254 },
+{ "yuml", 255 },
+{ "OElig", 338 },
+{ "oelig", 339 },
+{ "Scaron", 352 },
+{ "scaron", 353 },
+{ "Yuml", 376 },
+{ "fnof", 402 },
+{ "circ", 710 },
+{ "tilde", 732 },
+{ "Alpha", 913 },
+{ "Beta", 914 },
+{ "Gamma", 915 },
+{ "Delta", 916 },
+{ "Epsilon", 917 },
+{ "Zeta", 918 },
+{ "Eta", 919 },
+{ "Theta", 920 },
+{ "Iota", 921 },
+{ "Kappa", 922 },
+{ "Lambda", 923 },
+{ "Mu", 924 },
+{ "Nu", 925 },
+{ "Xi", 926 },
+{ "Omicron", 927 },
+{ "Pi", 928 },
+{ "Rho", 929 },
+{ "Sigma", 931 },
+{ "Tau", 932 },
+{ "Upsilon", 933 },
+{ "Phi", 934 },
+{ "Chi", 935 },
+{ "Psi", 936 },
+{ "Omega", 937 },
+{ "alpha", 945 },
+{ "beta", 946 },
+{ "gamma", 947 },
+{ "delta", 948 },
+{ "epsilon", 949 },
+{ "zeta", 950 },
+{ "eta", 951 },
+{ "theta", 952 },
+{ "iota", 953 },
+{ "kappa", 954 },
+{ "lambda", 955 },
+{ "mu", 956 },
+{ "nu", 957 },
+{ "xi", 958 },
+{ "omicron", 959 },
+{ "pi", 960 },
+{ "rho", 961 },
+{ "sigmaf", 962 },
+{ "sigma", 963 },
+{ "tau", 964 },
+{ "upsilon", 965 },
+{ "phi", 966 },
+{ "chi", 967 },
+{ "psi", 968 },
+{ "omega", 969 },
+{ "thetasym", 977 },
+{ "upsih", 978 },
+{ "piv", 982 },
+{ "ensp", 8194 },
+{ "emsp", 8195 },
+{ "thinsp", 8201 },
+{ "zwnj", 8204 },
+{ "zwj", 8205 },
+{ "lrm", 8206 },
+{ "rlm", 8207 },
+{ "ndash", 8211 },
+{ "mdash", 8212 },
+{ "lsquo", 8216 },
+{ "rsquo", 8217 },
+{ "sbquo", 8218 },
+{ "ldquo", 8220 },
+{ "rdquo", 8221 },
+{ "bdquo", 8222 },
+{ "dagger", 8224 },
+{ "Dagger", 8225 },
+{ "bull", 8226 },
+{ "hellip", 8230 },
+{ "permil", 8240 },
+{ "prime", 8242 },
+{ "Prime", 8243 },
+{ "lsaquo", 8249 },
+{ "rsaquo", 8250 },
+{ "oline", 8254 },
+{ "frasl", 8260 },
+{ "euro", 8364 },
+{ "image", 8465 },
+{ "weierp", 8472 },
+{ "real", 8476 },
+{ "trade", 8482 },
+{ "alefsym", 8501 },
+{ "larr", 8592 },
+{ "uarr", 8593 },
+{ "rarr", 8594 },
+{ "darr", 8595 },
+{ "harr", 8596 },
+{ "crarr", 8629 },
+{ "lArr", 8656 },
+{ "uArr", 8657 },
+{ "rArr", 8658 },
+{ "dArr", 8659 },
+{ "hArr", 8660 },
+{ "forall", 8704 },
+{ "part", 8706 },
+{ "exist", 8707 },
+{ "empty", 8709 },
+{ "nabla", 8711 },
+{ "isin", 8712 },
+{ "notin", 8713 },
+{ "ni", 8715 },
+{ "prod", 8719 },
+{ "sum", 8721 },
+{ "minus", 8722 },
+{ "lowast", 8727 },
+{ "radic", 8730 },
+{ "prop", 8733 },
+{ "infin", 8734 },
+{ "ang", 8736 },
+{ "and", 8743 },
+{ "or", 8744 },
+{ "cap", 8745 },
+{ "cup", 8746 },
+{ "int", 8747 },
+{ "there4", 8756 },
+{ "sim", 8764 },
+{ "cong", 8773 },
+{ "asymp", 8776 },
+{ "ne", 8800 },
+{ "equiv", 8801 },
+{ "le", 8804 },
+{ "ge", 8805 },
+{ "sub", 8834 },
+{ "sup", 8835 },
+{ "nsub", 8836 },
+{ "sube", 8838 },
+{ "supe", 8839 },
+{ "oplus", 8853 },
+{ "otimes", 8855 },
+{ "perp", 8869 },
+{ "sdot", 8901 },
+{ "lceil", 8968 },
+{ "rceil", 8969 },
+{ "lfloor", 8970 },
+{ "rfloor", 8971 },
+{ "lang", 9001 },
+{ "rang", 9002 },
+{ "loz", 9674 },
+{ "spades", 9824 },
+{ "clubs", 9827 },
+{ "hearts", 9829 },
+{ "diams", 9830 },
+
+#endif // OMEGA_INCLUDED_NAMEDENTITIES_H
diff --git a/src/zim_types.h b/src/zim_types.h

new file mode 100644 (file)

index 0000000..3602d6d
--- /dev/null
+++ b/src/zim_types.h
@@ -0,0 +1,105 @@
+
+
+#ifndef ZIM_TYPES_H
+#define ZIM_TYPES_H
+
+#include <zim/zim.h>
+
+#include <ostream>
+
+// Wrapper that gives a plain numeric type B a distinct C++ type.
+//
+// Construction from B and conversion back to B (or to bool) are explicit,
+// so values of different wrapper types cannot be mixed by accident.  The
+// wrapped value is stored in `v`; a default-constructed wrapper holds 0.
+template<typename B>
+struct REAL_TYPEDEF{
+  typedef B base_type;
+  B v;
+  REAL_TYPEDEF() : v(0) {}
+  explicit REAL_TYPEDEF(B v) : v(v) {}
+  // True when the wrapped value is non-zero.
+  explicit inline operator bool() const { return v != 0; }
+  explicit inline operator B() const { return v; }
+
+  // const-qualified so that const objects can be compared; the generic
+  // operator!= in this header takes its operands by const reference and
+  // relies on this.
+  inline bool operator==(const REAL_TYPEDEF<B>& rhs) const
+  { return v == rhs.v; }
+};
+
+// Arithmetic and comparison operators for the wrapper types.
+//
+// These templates are declared at global scope and are not constrained:
+// they are candidates for any type T, but their bodies only compile for
+// types with a member `v` (and, for the second operator+=, a nested
+// `base_type`).  Both operands must have the same type T.
+
+// lhs += rhs for two wrappers of the same type.
+template<typename T> inline T& operator+= (T& lhs, const T& rhs)
+{
+  lhs.v += rhs.v;
+  return lhs;
+}
+
+// lhs += raw value of the underlying base type.
+template<typename T> inline T& operator+= (T& lhs, const typename T::base_type& rhs)
+{
+  lhs.v += rhs;
+  return lhs;
+}
+
+// lhs is taken by value so it can be modified and returned.
+template<typename T> inline T operator+(T lhs, const T& rhs)
+{
+  lhs += rhs;
+  return lhs;
+}
+
+template<typename T> inline T& operator-=(T& lhs, const T& rhs)
+{
+  lhs.v -= rhs.v;
+  return lhs;
+}
+
+template<typename T> inline T operator-(T lhs, const T& rhs)
+{
+  lhs -= rhs;
+  return lhs;
+}
+
+// operator< compares the wrapped values; >, <= and >= are all expressed
+// in terms of it.
+template<typename T> inline bool operator< (const T& lhs, const T& rhs)
+{ return lhs.v < rhs.v; }
+
+template<typename T> inline bool operator> (const T& lhs, const T& rhs)
+{ return rhs < lhs; }
+
+template<typename T> inline bool operator<=(const T& lhs, const T& rhs)
+{ return !(lhs > rhs); }
+
+template<typename T> inline bool operator>=(const T& lhs, const T& rhs)
+{ return !(lhs < rhs); }
+
+// Defined as the negation of T's operator==.
+template<typename T> inline bool operator!=(const T& lhs, const T& rhs)
+{ return !(lhs == rhs); }
+
+
+// Stream a wrapper by writing its wrapped value.
+template<typename B>
+std::ostream& operator<<(std::ostream& os, const REAL_TYPEDEF<B>& obj)
+{
+    return os << obj.v;
+}
+
+namespace zim {
+
+// TYPEDEF(NAME, TYPE) declares struct NAME deriving from REAL_TYPEDEF<TYPE>
+// with an explicit constructor defaulting to 0, and checks at compile time
+// that the wrapper is exactly as large as TYPE.
+#define TYPEDEF(NAME, TYPE) struct NAME : public REAL_TYPEDEF<TYPE> { \
+explicit NAME(TYPE v=0) : REAL_TYPEDEF<TYPE>(v) {}; }; \
+static_assert(sizeof(NAME) == sizeof(TYPE), "");
+
+// The underlying *_type names are not declared in this file.
+// NOTE(review): presumably they come from <zim/zim.h> -- confirm.
+TYPEDEF(article_index_t, article_index_type)
+TYPEDEF(cluster_index_t, cluster_index_type)
+TYPEDEF(blob_index_t, blob_index_type)
+
+TYPEDEF(zsize_t, size_type)
+TYPEDEF(offset_t, offset_type)
+
+#undef TYPEDEF
+
+// The generic operators require both operands to have the same type; these
+// two overloads additionally allow a size to be added to an offset, giving
+// an offset.
+inline offset_t& operator+= (offset_t& lhs, const zsize_t& rhs)
+{
+  lhs.v += rhs.v;
+  return lhs;
+}
+
+inline offset_t operator+(offset_t lhs, const zsize_t& rhs)
+{
+  lhs += rhs;
+  return lhs;
+}
+
+};
+
+#endif //ZIM_TYPES_H
diff --git a/static/meson.build b/static/meson.build

new file mode 100644 (file)

index 0000000..b6c1d7f
--- /dev/null
+++ b/static/meson.build
@@ -0,0 +1,12 @@
+
+# File listing the resources to process; it is the input of the target
+# below.
+resources_list = 'resources_list.txt'
+
+# Build step that runs res_compiler on the resource list and produces one
+# C++ source file and one header.  res_compiler is not defined in this
+# file.
+# NOTE(review): --source_dir is given @OUTDIR@ (the directory the outputs
+# are written to), not the source directory -- confirm this is what
+# res_compiler expects.
+lib_resources = custom_target('resources',
+  input: resources_list,
+  output: ['libzim-resources.cpp', 'libzim-resources.h'],
+  command:[res_compiler,
+           '--cxxfile', '@OUTPUT0@',
+           '--hfile', '@OUTPUT1@',
+           '--source_dir', '@OUTDIR@',
+           '@INPUT@']
+)
diff --git a/static/resources_list.txt b/static/resources_list.txt

new file mode 100644 (file)

index 0000000..0e2cb68
--- /dev/null
+++ b/static/resources_list.txt
@@ -0,0 +1,57 @@
+stopwords/af
+stopwords/ar
+stopwords/bg
+stopwords/bn
+stopwords/br
+stopwords/ca
+stopwords/cs
+stopwords/da
+stopwords/de
+stopwords/el
+stopwords/en
+stopwords/eo
+stopwords/es
+stopwords/et
+stopwords/eu
+stopwords/fa
+stopwords/fi
+stopwords/fr
+stopwords/ga
+stopwords/gl
+stopwords/ha
+stopwords/he
+stopwords/hi
+stopwords/hr
+stopwords/hu
+stopwords/hy
+stopwords/id
+stopwords/it
+stopwords/ja
+stopwords/ko
+stopwords/ku
+stopwords/la
+stopwords/lt
+stopwords/lv
+stopwords/mr
+stopwords/ms
+stopwords/nl
+stopwords/no
+stopwords/pl
+stopwords/pt
+stopwords/ro
+stopwords/ru
+stopwords/sk
+stopwords/sl
+stopwords/so
+stopwords/st
+stopwords/sv
+stopwords/sw
+stopwords/th
+stopwords/tl
+stopwords/tr
+stopwords/uk
+stopwords/ur
+stopwords/vi
+stopwords/yo
+stopwords/zh
+stopwords/zu
+\ No newline at end of file
diff --git a/static/stopwords/af b/static/stopwords/af

new file mode 100644 (file)

index 0000000..b13c76d
--- /dev/null
+++ b/static/stopwords/af
@@ -0,0 +1,51 @@
+'n
+aan
+af
+al
+as
+baie
+by
+daar
+dag
+dat
+die
+dit
+een
+ek
+en
+gaan
+gesê
+haar
+het
+hom
+hulle
+hy
+in
+is
+jou
+jy
+kan
+kom
+ma
+maar
+met
+my
+na
+nie
+om
+ons
+op
+saam
+sal
+se
+sien
+so
+sy
+te
+toe
+uit
+van
+vir
+was
+wat
+ŉ
+\ No newline at end of file
diff --git a/static/stopwords/ar b/static/stopwords/ar

new file mode 100644 (file)

index 0000000..81173e6
--- /dev/null
+++ b/static/stopwords/ar
@@ -0,0 +1,480 @@
+،
+آض
+آمينَ
+آه
+آهاً
+آي
+أ
+أب
+أجل
+أجمع
+أخ
+أخذ
+أصبح
+أضحى
+أقبل
+أقل
+أكثر
+ألا
+أم
+أما
+أمامك
+أمامكَ
+أمسى
+أمّا
+أن
+أنا
+أنت
+أنتم
+أنتما
+أنتن
+أنتِ
+أنشأ
+أنّى
+أو
+أوشك
+أولئك
+أولئكم
+أولاء
+أولالك
+أوّهْ
+أي
+أيا
+أين
+أينما
+أيّ
+أَنَّ
+أََيُّ
+أُفٍّ
+إذ
+إذا
+إذاً
+إذما
+إذن
+إلى
+إليكم
+إليكما
+إليكنّ
+إليكَ
+إلَيْكَ
+إلّا
+إمّا
+إن
+إنّما
+إي
+إياك
+إياكم
+إياكما
+إياكن
+إيانا
+إياه
+إياها
+إياهم
+إياهما
+إياهن
+إياي
+إيهٍ
+إِنَّ
+ا
+ابتدأ
+اثر
+اجل
+احد
+اخرى
+اخلولق
+اذا
+اربعة
+ارتدّ
+استحال
+اطار
+اعادة
+اعلنت
+اف
+اكثر
+اكد
+الألاء
+الألى
+الا
+الاخيرة
+الان
+الاول
+الاولى
+التى
+التي
+الثاني
+الثانية
+الذاتي
+الذى
+الذي
+الذين
+السابق
+الف
+اللائي
+اللاتي
+اللتان
+اللتيا
+اللتين
+اللذان
+اللذين
+اللواتي
+الماضي
+المقبل
+الوقت
+الى
+اليوم
+اما
+امام
+امس
+ان
+انبرى
+انقلب
+انه
+انها
+او
+اول
+اي
+ايار
+ايام
+ايضا
+ب
+بات
+باسم
+بان
+بخٍ
+برس
+بسبب
+بسّ
+بشكل
+بضع
+بطآن
+بعد
+بعض
+بك
+بكم
+بكما
+بكن
+بل
+بلى
+بما
+بماذا
+بمن
+بن
+بنا
+به
+بها
+بي
+بيد
+بين
+بَسْ
+بَلْهَ
+بِئْسَ
+تانِ
+تانِك
+تبدّل
+تجاه
+تحوّل
+تلقاء
+تلك
+تلكم
+تلكما
+تم
+تينك
+تَيْنِ
+تِه
+تِي
+ثلاثة
+ثم
+ثمّ
+ثمّة
+ثُمَّ
+جعل
+جلل
+جميع
+جير
+حار
+حاشا
+حاليا
+حاي
+حتى
+حرى
+حسب
+حم
+حوالى
+حول
+حيث
+حيثما
+حين
+حيَّ
+حَبَّذَا
+حَتَّى
+حَذارِ
+خلا
+خلال
+دون
+دونك
+ذا
+ذات
+ذاك
+ذانك
+ذانِ
+ذلك
+ذلكم
+ذلكما
+ذلكن
+ذو
+ذوا
+ذواتا
+ذواتي
+ذيت
+ذينك
+ذَيْنِ
+ذِه
+ذِي
+راح
+رجع
+رويدك
+ريث
+رُبَّ
+زيارة
+سبحان
+سرعان
+سنة
+سنوات
+سوف
+سوى
+سَاءَ
+سَاءَمَا
+شبه
+شخصا
+شرع
+شَتَّانَ
+صار
+صباح
+صفر
+صهٍ
+صهْ
+ضد
+ضمن
+طاق
+طالما
+طفق
+طَق
+ظلّ
+عاد
+عام
+عاما
+عامة
+عدا
+عدة
+عدد
+عدم
+عسى
+عشر
+عشرة
+علق
+على
+عليك
+عليه
+عليها
+علًّ
+عن
+عند
+عندما
+عوض
+عين
+عَدَسْ
+عَمَّا
+غدا
+غير
+ـ
+ف
+فان
+فلان
+فو
+فى
+في
+فيم
+فيما
+فيه
+فيها
+قال
+قام
+قبل
+قد
+قطّ
+قلما
+قوة
+كأنّما
+كأين
+كأيّ
+كأيّن
+كاد
+كان
+كانت
+كذا
+كذلك
+كرب
+كل
+كلا
+كلاهما
+كلتا
+كلم
+كليكما
+كليهما
+كلّما
+كلَّا
+كم
+كما
+كي
+كيت
+كيف
+كيفما
+كَأَنَّ
+كِخ
+لئن
+لا
+لات
+لاسيما
+لدن
+لدى
+لعمر
+لقاء
+لك
+لكم
+لكما
+لكن
+لكنَّما
+لكي
+لكيلا
+للامم
+لم
+لما
+لمّا
+لن
+لنا
+له
+لها
+لو
+لوكالة
+لولا
+لوما
+لي
+لَسْتَ
+لَسْتُ
+لَسْتُم
+لَسْتُمَا
+لَسْتُنَّ
+لَسْتِ
+لَسْنَ
+لَعَلَّ
+لَكِنَّ
+لَيْتَ
+لَيْسَ
+لَيْسَا
+لَيْسَتَا
+لَيْسَتْ
+لَيْسُوا
+لَِسْنَا
+ما
+ماانفك
+مابرح
+مادام
+ماذا
+مازال
+مافتئ
+مايو
+متى
+مثل
+مذ
+مساء
+مع
+معاذ
+مقابل
+مكانكم
+مكانكما
+مكانكنّ
+مكانَك
+مليار
+مليون
+مما
+ممن
+من
+منذ
+منها
+مه
+مهما
+مَنْ
+مِن
+نحن
+نحو
+نعم
+نفس
+نفسه
+نهاية
+نَخْ
+نِعِمّا
+نِعْمَ
+ها
+هاؤم
+هاكَ
+هاهنا
+هبّ
+هذا
+هذه
+هكذا
+هل
+هلمَّ
+هلّا
+هم
+هما
+هن
+هنا
+هناك
+هنالك
+هو
+هي
+هيا
+هيت
+هيّا
+هَؤلاء
+هَاتانِ
+هَاتَيْنِ
+هَاتِه
+هَاتِي
+هَجْ
+هَذا
+هَذانِ
+هَذَيْنِ
+هَذِه
+هَذِي
+هَيْهَاتَ
+و
+و6
+وا
+واحد
+واضاف
+واضافت
+واكد
+وان
+واهاً
+واوضح
+وراءَك
+وفي
+وقال
+وقالت
+وقد
+وقف
+وكان
+وكانت
+ولا
+ولم
+ومن
+وهو
+وهي
+ويكأنّ
+وَيْ
+وُشْكَانََ
+يكون
+يمكن
+يوم
+ّأيّان
+\ No newline at end of file
diff --git a/static/stopwords/bg b/static/stopwords/bg

new file mode 100644 (file)

index 0000000..2d25b3e
--- /dev/null
+++ b/static/stopwords/bg
@@ -0,0 +1,518 @@
+ð°
+ð°ð²ñ‚ðµð½ñ‚ð¸ñ‡ðµð½
+ð°ð·
+ð°ðºð¾
+ð°ð»ð°
+ð±ðµ
+ð±ðµð·
+ð±ðµñˆðµ
+ð±ð¸
+ð±ð¸ð²ñˆ
+ð±ð¸ð²ñˆð°
+ð±ð¸ð²ñˆð¾
+ð±ð¸ð»
+ð±ð¸ð»ð°
+ð±ð¸ð»ð¸
+ð±ð¸ð»ð¾
+ð±ð»ð°ð³ð¾ð´ð°ñ€ñ\8f
+ð±ð»ð¸ð·ð¾
+ð±ñ\8fñ…ð°
+ð±ñšð´ð°ñ‚
+ð±ñšð´ðµ
+ð²
+ð²ð°ñ\81
+ð²ð°ñˆ
+ð²ð°ñˆð°
+ð²ðµñ‡ðµ
+ð²ðµñ€ð¾ñ\8fñ‚ð½ð¾
+ð²ð·ðµð¼ð°
+ð²ð¸
+ð²ð¸ðµ
+ð²ð¸ð½ð°ð³ð¸
+ð²ð½ð¸ð¼ð°ð²ð°
+ð²ñ\81ðµ
+ð²ñ\81ðµðºð¸
+ð²ñ\81ð¸ñ‡ðºð¸
+ð²ñ\81ð¸ñ‡ðºð¾
+ð²ñ\81ñ\8fðºð°
+ð²ñšð²
+ð²ñšð¿ñ€ðµðºð¸
+ð²ñšñ€ñ…ñƒ
+ð²ñ€ðµð¼ðµ
+ð³
+ð³ð¸
+ð³ð»ð°ð²ðµð½
+ð³ð»ð°ð²ð½ð°
+ð³ð»ð°ð²ð½ð¾
+ð³ð»ð°ñ\81
+ð³ð¾
+ð³ð¾ð´ð¸ð½ð°
+ð³ð¾ð´ð¸ð½ð¸
+ð³ð¾ð´ð¸ñˆðµð½
+ð´
+ð´ð°
+ð´ð°ð»ð¸
+ð´ð²ð°
+ð´ð²ð°ð¼ð°
+ð´ð²ð°ð¼ð°ñ‚ð°
+ð´ð²ðµ
+ð´ð²ðµñ‚ðµ
+ð´ðµð½
+ð´ð½ðµñ\81
+ð´ð½ð¸
+ð´ð¾
+ð´ð¾ð±ñšñ€
+ð´ð¾ð±ñ€ð°
+ð´ð¾ð±ñ€ðµ
+ð´ð¾ð±ñ€ð¾
+ð´ð¾ðºð°ñ‚ð¾
+ð´ð¾ðºð¾ð³ð°
+ð´ð¾ñ\81ðµð³ð°
+ð´ð¾ñ\81ñ‚ð°
+ð´ð¾ñ€ð¸
+ð´ñ€ñƒð³
+ð´ñ€ñƒð³ð°
+ð´ñ€ñƒð³ð¸
+ðµ
+ðµð²ñ‚ð¸ð½
+ðµð´ð²ð°
+ðµð´ð¸ð½
+ðµð´ð½ð°
+ðµð´ð½ð°ðºð²ð°
+ðµð´ð½ð°ðºð²ð¸
+ðµð´ð½ð°ðºñšð²
+ðµð´ð½ð¾
+ðµðºð¸ð¿
+ðµñ‚ð¾
+ð¶ð¸ð²ð¾ñ‚
+ð·ð°
+ð·ð°ð±ð°ð²ñ\8fð¼
+ð·ð°ð´
+ð·ð°ðµð´ð½ð¾
+ð·ð°ñ\81ðµð³ð°
+ð·ð°ñ\81ð¿ð°ð»
+ð·ð°ñ‚ð¾ð²ð°
+ð·ð°ñ‰ð¾
+ð·ð°ñ‰ð¾ñ‚ð¾
+ð·ð°ñ€ð°ð´ð¸
+ð¸
+ð¸ð·
+ð¸ð»ð¸
+ð¸ð¼
+ð¸ð¼ð°
+ð¸ð¼ð°ñ‚
+ð¸ñ\81ðºð°
+ð¹
+ðºð°ð·ð°
+ðºð°ðº
+ðºð°ðºð²ð°
+ðºð°ðºð²ð¾
+ðºð°ðºñšð²
+ðºð°ðºñ‚ð¾
+ðºð°ñ‚ð¾
+ðºð¾ð³ð°
+ðºð¾ð³ð°ñ‚ð¾
+ðºð¾ðµñ‚ð¾
+ðºð¾ð¸ñ‚ð¾
+ðºð¾ð¹
+ðºð¾ð¹ñ‚ð¾
+ðºð¾ð»ðºð¾
+ðºð¾ñ\8fñ‚ð¾
+ðºñšð´ðµ
+ðºñšð´ðµñ‚ð¾
+ðºñšð¼
+ð»ðµñ\81ðµð½
+ð»ðµñ\81ð½ð¾
+ð»ð¸
+ð»ð¾ñˆ
+ð¼
+ð¼ð°ð¹
+ð¼ð°ð»ðºð¾
+ð¼ðµ
+ð¼ðµð¶ð´ñƒ
+ð¼ðµðº
+ð¼ðµð½
+ð¼ðµñ\81ðµñ†
+ð¼ð¸
+ð¼ð½ð¾ð³ð¾
+ð¼ð½ð¾ð·ð¸ð½ð°
+ð¼ð¾ð³ð°
+ð¼ð¾ð³ð°ñ‚
+ð¼ð¾ð¶ðµ
+ð¼ð¾ðºñšñ€
+ð¼ð¾ð»ñ\8f
+ð¼ð¾ð¼ðµð½ñ‚ð°
+ð¼ñƒ
+ð½
+ð½ð°
+ð½ð°ð´
+ð½ð°ð·ð°ð´
+ð½ð°ð¹
+ð½ð°ð¿ñ€ð°ð²ð¸
+ð½ð°ð¿ñ€ðµð´
+ð½ð°ð¿ñ€ð¸ð¼ðµñ€
+ð½ð°ñ\81
+ð½ðµ
+ð½ðµð³ð¾
+ð½ðµñ\8f
+ð½ðµñ‰ð¾
+ð½ð¸
+ð½ð¸ðµ
+ð½ð¸ðºð¾ð¹
+ð½ð¸ñ‚ð¾
+ð½ð¸ñ‰ð¾
+ð½ð¾
+ð½ð¾ð²
+ð½ð¾ð²ð°
+ð½ð¾ð²ð¸
+ð½ð¾ð²ð¸ð½ð°
+ð½ñ\8fðºð¾ð¸
+ð½ñ\8fðºð¾ð¹
+ð½ñ\8fðºð¾ð»ðºð¾
+ð½ñ\8fð¼ð°
+ð¾ð±ð°ñ‡ðµ
+ð¾ðºð¾ð»ð¾
+ð¾ñ\81ð²ðµð½
+ð¾ñ\81ð¾ð±ðµð½ð¾
+ð¾ñ‚
+ð¾ñ‚ð³ð¾ñ€ðµ
+ð¾ñ‚ð½ð¾ð²ð¾
+ð¾ñ‰ðµ
+ð¿ð°ðº
+ð¿ð¾
+ð¿ð¾ð²ðµñ‡ðµ
+ð¿ð¾ð²ðµñ‡ðµñ‚ð¾
+ð¿ð¾ð´
+ð¿ð¾ð½ðµ
+ð¿ð¾ñ\81ð»ðµ
+ð¿ð¾ñ‡ñ‚ð¸
+ð¿ð¾ñ€ð°ð´ð¸
+ð¿ñšðº
+ð¿ñšñ‚ð¸
+ð¿ñšñ€ð²ð°ñ‚ð°
+ð¿ñšñ€ð²ð¸
+ð¿ñšñ€ð²ð¾
+ð¿ñ€ð°ð²ð¸
+ð¿ñ€ðµð´
+ð¿ñ€ðµð´ð¸
+ð¿ñ€ðµð·
+ð¿ñ€ð¸
+ñ\81
+ñ\81ð°
+ñ\81ð°ð¼
+ñ\81ð°ð¼ð¾
+ñ\81ðµ
+ñ\81ðµð³ð°
+ñ\81ð¸
+ñ\81ð¸ð½
+ñ\81ðºð¾ñ€ð¾
+ñ\81ð»ðµð´
+ñ\81ð»ðµð´ð²ð°ñ‰
+ñ\81ð¼ðµ
+ñ\81ð¼ñ\8fñ…
+ñ\81ð¿ð¾ñ€ðµð´
+ñ\81ñšð¼
+ñ\81ñšñ\81
+ñ\81ñšñ‰ð¾
+ñ\81ñ‚ðµ
+ñ\81ñ€ðµð´
+ñ\81ñ€ðµñ‰ñƒ
+ñ\8f
+ñ\8fðº
+ñžð¼ñ€ñƒðº
+ñƒ
+ñƒñ‚ñ€ðµ
+ñ‚
+ñ‚.ð½.
+ñ‚ð°ð·ð¸
+ñ‚ð°ðºð°
+ñ‚ð°ðºð¸ð²ð°
+ñ‚ð°ðºñšð²
+ñ‚ð°ð¼
+ñ‚ð²ð¾ð¹
+ñ‚ðµ
+ñ‚ðµð·ð¸
+ñ‚ð¸
+ñ‚ð¾
+ñ‚ð¾ð²ð°
+ñ‚ð¾ð³ð°ð²ð°
+ñ‚ð¾ð·ð¸
+ñ‚ð¾ð¹
+ñ‚ð¾ð»ðºð¾ð²ð°
+ñ‚ð¾ñ‡ð½ð¾
+ñ‚ñ\8f
+ñ‚ñ\8fñ…
+ñ‚ñšð¹
+ñ‚ñƒðº
+ñ‚ñ€ð¸
+ñ‚ñ€ñ\8fð±ð²ð°
+ñ‡
+ñ‡ð°ñ\81ð°
+ñ‡ðµ
+ñ‡ðµñ\81ñ‚ð¾
+ñ‡ñ€ðµð·
+ñ…ð°ñ€ðµñ\81ð²ð°
+ñ…ð¸ð»ñ\8fð´ð¸
+ñ‰ðµ
+ñ‰ð¾ð¼
+ñ€ð°ð²ðµð½
+ñ€ð°ð²ð½ð°
+а
+автентичен
+аз
+ако
+ала
+бе
+без
+беше
+би
+бивш
+бивша
+бившо
+бил
+била
+били
+било
+благодаря
+близо
+бъдат
+бъде
+бяха
+в
+вас
+ваш
+ваша
+вероятно
+вече
+взема
+ви
+вие
+винаги
+внимава
+време
+все
+всеки
+всички
+всичко
+всяка
+във
+въпреки
+върху
+г
+ги
+главен
+главна
+главно
+глас
+го
+година
+години
+годишен
+д
+да
+дали
+два
+двама
+двамата
+две
+двете
+ден
+днес
+дни
+до
+добра
+добре
+добро
+добър
+докато
+докога
+дори
+досега
+доста
+друг
+друга
+други
+е
+евтин
+едва
+един
+една
+еднаква
+еднакви
+еднакъв
+едно
+екип
+ето
+живот
+за
+забавям
+зад
+заедно
+заради
+засега
+заспал
+затова
+защо
+защото
+и
+из
+или
+им
+има
+имат
+иска
+й
+каза
+как
+каква
+какво
+както
+какъв
+като
+кога
+когато
+което
+които
+кой
+който
+колко
+която
+къде
+където
+към
+лесен
+лесно
+ли
+лош
+м
+май
+малко
+ме
+между
+мек
+мен
+месец
+ми
+много
+мнозина
+мога
+могат
+може
+мокър
+моля
+момента
+му
+н
+на
+над
+назад
+най
+направи
+напред
+например
+нас
+не
+него
+нещо
+нея
+ни
+ние
+никой
+нито
+нищо
+но
+нов
+нова
+нови
+новина
+някои
+някой
+няколко
+няма
+обаче
+около
+освен
+особено
+от
+отгоре
+отново
+още
+пак
+по
+повече
+повечето
+под
+поне
+поради
+после
+почти
+прави
+пред
+преди
+през
+при
+пък
+първата
+първи
+първо
+пъти
+равен
+равна
+с
+са
+сам
+само
+се
+сега
+си
+син
+скоро
+след
+следващ
+сме
+смях
+според
+сред
+срещу
+сте
+съм
+със
+също
+т
+т.н.
+тази
+така
+такива
+такъв
+там
+твой
+те
+тези
+ти
+то
+това
+тогава
+този
+той
+толкова
+точно
+три
+трябва
+тук
+тъй
+тя
+тях
+у
+утре
+харесва
+хиляди
+ч
+часа
+че
+често
+чрез
+ще
+щом
+юмрук
+я
+як
+\ No newline at end of file
diff --git a/static/stopwords/bn b/static/stopwords/bn

new file mode 100644 (file)

index 0000000..9dc1bfc
--- /dev/null
+++ b/static/stopwords/bn
@@ -0,0 +1,398 @@
+অতএব
+অথচ
+অথবা
+অনুযায়ী
+অনেক
+অনেকে
+অনেকেই
+অন্তত
+অন্য
+অবধি
+অবশ্য
+অর্থাত
+আই
+আগামী
+আগে
+আগেই
+আছে
+আজ
+আদ্যভাগে
+আপনার
+আপনি
+আবার
+আমরা
+আমাকে
+আমাদের
+আমার
+আমি
+আর
+আরও
+ই
+ইত্যাদি
+ইহা
+উচিত
+উত্তর
+উনি
+উপর
+উপরে
+এ
+এঁদের
+এঁরা
+এই
+একই
+একটি
+একবার
+একে
+এক্
+এখন
+এখনও
+এখানে
+এখানেই
+এটা
+এটাই
+এটি
+এত
+এতটাই
+এতে
+এদের
+এব
+এবং
+এবার
+এমন
+এমনকী
+এমনি
+এর
+এরা
+এল
+এস
+এসে
+ঐ
+ও
+ওঁদের
+ওঁর
+ওঁরা
+ওই
+ওকে
+ওখানে
+ওদের
+ওর
+ওরা
+কখনও
+কত
+কবে
+কমনে
+কয়েক
+কয়েকটি
+করছে
+করছেন
+করতে
+করবে
+করবেন
+করলে
+করলেন
+করা
+করাই
+করায়
+করার
+করি
+করিতে
+করিয়া
+করিয়ে
+করে
+করেই
+করেছিলেন
+করেছে
+করেছেন
+করেন
+কাউকে
+কাছ
+কাছে
+কাজ
+কাজে
+কারও
+কারণ
+কি
+কিংবা
+কিছু
+কিছুই
+কিন্তু
+কী
+কে
+কেউ
+কেউই
+কেখা
+কেন
+কোটি
+কোন
+কোনও
+কোনো
+ক্ষেত্রে
+কয়েক
+খুব
+গিয়ে
+গিয়েছে
+গিয়ে
+গুলি
+গেছে
+গেল
+গেলে
+গোটা
+চলে
+চান
+চায়
+চার
+চালু
+চেয়ে
+চেষ্টা
+ছাড়া
+ছাড়াও
+ছিল
+ছিলেন
+জন
+জনকে
+জনের
+জন্য
+জন্যওজে
+জানতে
+জানা
+জানানো
+জানায়
+জানিয়ে
+জানিয়েছে
+জে
+জ্নজন
+টি
+ঠিক
+তখন
+তত
+তথা
+তবু
+তবে
+তা
+তাঁকে
+তাঁদের
+তাঁর
+তাঁরা
+তাঁাহারা
+তাই
+তাও
+তাকে
+তাতে
+তাদের
+তার
+তারপর
+তারা
+তারৈ
+তাহলে
+তাহা
+তাহাতে
+তাহার
+তিনঐ
+তিনি
+তিনিও
+তুমি
+তুলে
+তেমন
+তো
+তোমার
+থাকবে
+থাকবেন
+থাকা
+থাকায়
+থাকে
+থাকেন
+থেকে
+থেকেই
+থেকেও
+দিকে
+দিতে
+দিন
+দিয়ে
+দিয়েছে
+দিয়েছেন
+দিলেন
+দু
+দুই
+দুটি
+দুটো
+দেওয়া
+দেওয়ার
+দেওয়া
+দেখতে
+দেখা
+দেখে
+দেন
+দেয়
+দ্বারা
+ধরা
+ধরে
+ধামার
+নতুন
+নয়
+না
+নাই
+নাকি
+নাগাদ
+নানা
+নিজে
+নিজেই
+নিজেদের
+নিজের
+নিতে
+নিয়ে
+নিয়ে
+নেই
+নেওয়া
+নেওয়ার
+নেওয়া
+নয়
+পক্ষে
+পর
+পরে
+পরেই
+পরেও
+পর্যন্ত
+পাওয়া
+পাচ
+পারি
+পারে
+পারেন
+পি
+পেয়ে
+পেয়্র্
+প্রতি
+প্রথম
+প্রভৃতি
+প্রযন্ত
+প্রাথমিক
+প্রায়
+প্রায়
+ফলে
+ফিরে
+ফের
+বক্তব্য
+বদলে
+বন
+বরং
+বলতে
+বলল
+বললেন
+বলা
+বলে
+বলেছেন
+বলেন
+বসে
+বহু
+বা
+বাদে
+বার
+বি
+বিনা
+বিভিন্ন
+বিশেষ
+বিষয়টি
+বেশ
+বেশি
+ব্যবহার
+ব্যাপারে
+ভাবে
+ভাবেই
+মতো
+মতোই
+মধ্যভাগে
+মধ্যে
+মধ্যেই
+মধ্যেও
+মনে
+মাত্র
+মাধ্যমে
+মোট
+মোটেই
+যখন
+যত
+যতটা
+যথেষ্ট
+যদি
+যদিও
+যা
+যাঁর
+যাঁরা
+যাওয়া
+যাওয়ার
+যাওয়া
+যাকে
+যাচ্ছে
+যাতে
+যাদের
+যান
+যাবে
+যায়
+যার
+যারা
+যিনি
+যে
+যেখানে
+যেতে
+যেন
+যেমন
+র
+রকম
+রয়েছে
+রাখা
+রেখে
+লক্ষ
+শুধু
+শুরু
+সঙ্গে
+সঙ্গেও
+সব
+সবার
+সমস্ত
+সম্প্রতি
+সহ
+সহিত
+সাধারণ
+সামনে
+সি
+সুতরাং
+সে
+সেই
+সেখান
+সেখানে
+সেটা
+সেটাই
+সেটাও
+সেটি
+স্পষ্ট
+স্বয়ং
+হইতে
+হইবে
+হইয়া
+হওয়া
+হওয়ায়
+হওয়ার
+হচ্ছে
+হত
+হতে
+হতেই
+হন
+হবে
+হবেন
+হয়
+হয়তো
+হয়নি
+হয়ে
+হয়েই
+হয়েছিল
+হয়েছে
+হয়েছেন
+হল
+হলে
+হলেই
+হলেও
+হলো
+হাজার
+হিসাবে
+হৈলে
+হোক
+হয়
+\ No newline at end of file
diff --git a/static/stopwords/br b/static/stopwords/br

new file mode 100644 (file)

index 0000000..5ade8a4
--- /dev/null
+++ b/static/stopwords/br
@@ -0,0 +1,126 @@
+a
+ainda
+alem
+ambas
+ambos
+antes
+ao
+aonde
+aos
+apos
+aquele
+aqueles
+as
+assim
+com
+como
+contra
+contudo
+cuja
+cujas
+cujo
+cujos
+da
+das
+de
+dela
+dele
+deles
+demais
+depois
+desde
+desta
+deste
+dispoe
+dispoem
+diversa
+diversas
+diversos
+do
+dos
+durante
+e
+ela
+elas
+ele
+eles
+em
+entao
+entre
+essa
+essas
+esse
+esses
+esta
+estas
+este
+estes
+ha
+isso
+isto
+logo
+mais
+mas
+mediante
+menos
+mesma
+mesmas
+mesmo
+mesmos
+na
+nao
+nas
+nem
+nesse
+neste
+nos
+o
+os
+ou
+outra
+outras
+outro
+outros
+pelas
+pelo
+pelos
+perante
+pois
+por
+porque
+portanto
+propios
+proprio
+quais
+qual
+qualquer
+quando
+quanto
+que
+quem
+quer
+se
+seja
+sem
+sendo
+seu
+seus
+sob
+sobre
+sua
+suas
+tal
+tambem
+teu
+teus
+toda
+todas
+todo
+todos
+tua
+tuas
+tudo
+um
+uma
+umas
+uns
+\ No newline at end of file
diff --git a/static/stopwords/ca b/static/stopwords/ca

new file mode 100644 (file)

index 0000000..cdba332
--- /dev/null
+++ b/static/stopwords/ca
@@ -0,0 +1,278 @@
+a
+abans
+ací
+ah
+així
+això
+al
+aleshores
+algun
+alguna
+algunes
+alguns
+alhora
+allà
+allí
+allò
+als
+altra
+altre
+altres
+amb
+ambdues
+ambdós
+anar
+ans
+apa
+aquell
+aquella
+aquelles
+aquells
+aquest
+aquesta
+aquestes
+aquests
+aquí
+baix
+bastant
+bé
+cada
+cadascuna
+cadascunes
+cadascuns
+cadascú
+com
+consegueixo
+conseguim
+conseguir
+consigueix
+consigueixen
+consigueixes
+contra
+d'un
+d'una
+d'unes
+d'uns
+dalt
+de
+del
+dels
+des
+des de
+després
+dins
+dintre
+donat
+doncs
+durant
+e
+eh
+el
+elles
+ells
+els
+em
+en
+encara
+ens
+entre
+era
+erem
+eren
+eres
+es
+esta
+estan
+estat
+estava
+estaven
+estem
+esteu
+estic
+està
+estàvem
+estàveu
+et
+etc
+ets
+fa
+faig
+fan
+fas
+fem
+fer
+feu
+fi
+fins
+fora
+gairebé
+ha
+han
+has
+haver
+havia
+he
+hem
+heu
+hi
+ho
+i
+igual
+iguals
+inclòs
+ja
+jo
+l'hi
+la
+les
+li
+li'n
+llarg
+llavors
+m'he
+ma
+mal
+malgrat
+mateix
+mateixa
+mateixes
+mateixos
+me
+mentre
+meu
+meus
+meva
+meves
+mode
+molt
+molta
+moltes
+molts
+mon
+mons
+més
+n'he
+n'hi
+ne
+ni
+no
+nogensmenys
+només
+nosaltres
+nostra
+nostre
+nostres
+o
+oh
+oi
+on
+pas
+pel
+pels
+per
+per que
+perquè
+però
+poc
+poca
+pocs
+podem
+poden
+poder
+podeu
+poques
+potser
+primer
+propi
+puc
+qual
+quals
+quan
+quant
+que
+quelcom
+qui
+quin
+quina
+quines
+quins
+què
+s'ha
+s'han
+sa
+sabem
+saben
+saber
+sabeu
+sap
+saps
+semblant
+semblants
+sense
+ser
+ses
+seu
+seus
+seva
+seves
+si
+sobre
+sobretot
+soc
+solament
+sols
+som
+son
+sons
+sota
+sou
+sóc
+són
+t'ha
+t'han
+t'he
+ta
+tal
+també
+tampoc
+tan
+tant
+tanta
+tantes
+te
+tene
+tenim
+tenir
+teniu
+teu
+teus
+teva
+teves
+tinc
+ton
+tons
+tot
+tota
+totes
+tots
+un
+una
+unes
+uns
+us
+va
+vaig
+vam
+van
+vas
+veu
+vosaltres
+vostra
+vostre
+vostres
+érem
+éreu
+és
+éssent
+últim
+ús
+\ No newline at end of file
diff --git a/static/stopwords/cs b/static/stopwords/cs

new file mode 100644 (file)

index 0000000..39b2968
--- /dev/null
+++ b/static/stopwords/cs
@@ -0,0 +1,550 @@
+a
+aby
+ahoj
+aj
+ale
+anebo
+ani
+aniž
+ano
+asi
+aspoåˆ
+aspoň
+atd
+atp
+az
+aä\8dkoli
+ačkoli
+až
+bez
+beze
+blãzko
+blízko
+bohuå¾el
+bohužel
+brzo
+bude
+budem
+budeme
+budes
+budete
+budeå¡
+budeš
+budou
+budu
+by
+byl
+byla
+byli
+bylo
+byly
+bys
+byt
+bä›hem
+být
+během
+chce
+chceme
+chcete
+chceå¡
+chceš
+chci
+chtãt
+chtä›jã
+chtít
+chtějí
+chut'
+chuti
+ci
+clanek
+clanku
+clanky
+co
+coz
+což
+cz
+daleko
+dalsi
+další
+den
+deset
+design
+devatenáct
+devatenã¡ct
+devä›t
+devět
+dnes
+do
+dobrã½
+dobrý
+docela
+dva
+dvacet
+dvanáct
+dvanã¡ct
+dvä›
+dvě
+dál
+dále
+dã¡l
+dã¡le
+dä›kovat
+dä›kujeme
+dä›kuji
+děkovat
+děkujeme
+děkuji
+email
+ho
+hodnä›
+hodně
+i
+jak
+jakmile
+jako
+jakož
+jde
+je
+jeden
+jedenáct
+jedenã¡ct
+jedna
+jedno
+jednou
+jedou
+jeho
+jehož
+jej
+jeji
+jejich
+jejã
+její
+jelikož
+jemu
+jen
+jenom
+jenž
+jeste
+jestli
+jestliå¾e
+jestliže
+jeå¡tä›
+ještě
+jež
+ji
+jich
+jimi
+jinak
+jine
+jiné
+jiz
+již
+jsem
+jses
+jseš
+jsi
+jsme
+jsou
+jste
+já
+jã¡
+jã
+jãm
+jí
+jím
+jíž
+jšte
+k
+kam
+každý
+kde
+kdo
+kdy
+kdyz
+kdyå¾
+když
+ke
+kolik
+kromä›
+kromě
+ktera
+ktere
+kteri
+kterou
+ktery
+která
+kterã¡
+kterã©
+kterã½
+které
+který
+kteå™ã
+kteři
+kteří
+ku
+kvå¯li
+kvůli
+ma
+majã
+mají
+mate
+me
+mezi
+mi
+mit
+mne
+mnou
+mnä›
+mně
+moc
+mohl
+mohou
+moje
+moji
+moå¾nã¡
+možná
+muj
+musã
+musí
+muze
+my
+má
+málo
+mám
+máme
+máte
+máš
+mã¡
+mã¡lo
+mã¡m
+mã¡me
+mã¡te
+mã¡å¡
+mã©
+mã
+mãt
+mä›
+må¯j
+må¯å¾e
+mé
+mí
+mít
+mě
+můj
+může
+na
+nad
+nade
+nam
+napiste
+napište
+naproti
+nas
+nasi
+naå¡e
+naå¡i
+načež
+naše
+naši
+ne
+nebo
+nebyl
+nebyla
+nebyli
+nebyly
+nechť
+nedä›lajã
+nedä›lã¡
+nedä›lã¡m
+nedä›lã¡me
+nedä›lã¡te
+nedä›lã¡å¡
+nedělají
+nedělá
+nedělám
+neděláme
+neděláte
+neděláš
+neg
+nejsi
+nejsou
+nemajã
+nemají
+nemáme
+nemáte
+nemã¡me
+nemã¡te
+nemä›l
+neměl
+neni
+nenã
+není
+nestaä\8dã
+nestačí
+nevadã
+nevadí
+nez
+neå¾
+než
+nic
+nich
+nimi
+nove
+novy
+nové
+nový
+nula
+ná
+nám
+námi
+nás
+náš
+nã¡m
+nã¡mi
+nã¡s
+nã¡å¡
+nãm
+nä›
+nä›co
+nä›jak
+nä›kde
+nä›kdo
+nä›mu
+ní
+ním
+ně
+něco
+nějak
+někde
+někdo
+němu
+němuž
+o
+od
+ode
+on
+ona
+oni
+ono
+ony
+osm
+osmnáct
+osmnã¡ct
+pak
+patnáct
+patnã¡ct
+po
+pod
+podle
+pokud
+potom
+pouze
+pozdä›
+pozdě
+poå™ã¡d
+pořád
+prave
+pravé
+pred
+pres
+pri
+pro
+proc
+prostä›
+prostě
+prosãm
+prosím
+proti
+proto
+protoze
+protoå¾e
+protože
+proä\8d
+proč
+prvni
+první
+práve
+pta
+pä›t
+på™ed
+på™es
+på™ese
+pět
+před
+přede
+přes
+přese
+při
+přičemž
+re
+rovnä›
+rovně
+s
+se
+sedm
+sedmnáct
+sedmnã¡ct
+si
+sice
+skoro
+smã
+smä›jã
+smí
+smějí
+snad
+spolu
+sta
+sto
+strana
+stã©
+sté
+sve
+svych
+svym
+svymi
+své
+svých
+svým
+svými
+svůj
+ta
+tady
+tak
+take
+takhle
+taky
+takze
+také
+takže
+tam
+tamhle
+tamhleto
+tamto
+tato
+te
+tebe
+tebou
+ted'
+tedy
+tema
+ten
+tento
+teto
+ti
+tim
+timto
+tipy
+tisãc
+tisãce
+tisíc
+tisíce
+to
+tobä›
+tobě
+tohle
+toho
+tohoto
+tom
+tomto
+tomu
+tomuto
+toto
+troå¡ku
+trošku
+tu
+tuto
+tvoje
+tvá
+tvã¡
+tvã©
+två¯j
+tvé
+tvůj
+ty
+tyto
+tä›
+tå™eba
+tå™i
+tå™inã¡ct
+téma
+této
+tím
+tímto
+tě
+těm
+těma
+těmu
+třeba
+tři
+třináct
+u
+urä\8ditä›
+určitě
+uz
+uå¾
+už
+v
+vam
+vas
+vase
+vaå¡e
+vaå¡i
+vaše
+vaši
+ve
+vedle
+veä\8der
+večer
+vice
+vlastnä›
+vlastně
+vsak
+vy
+vám
+vámi
+vás
+váš
+vã¡m
+vã¡mi
+vã¡s
+vã¡å¡
+vå¡echno
+vå¡ichni
+vå¯bec
+vå¾dy
+více
+však
+všechen
+všechno
+všichni
+vůbec
+vždy
+z
+za
+zatãmco
+zatímco
+zaä\8d
+zač
+zda
+zde
+ze
+zpet
+zpravy
+zprávy
+zpět
+ä\8dau
+ä\8dtrnã¡ct
+ä\8dtyå™i
+å¡est
+å¡estnã¡ct
+å¾e
+čau
+či
+článek
+článku
+články
+čtrnáct
+čtyři
+šest
+šestnáct
+že
+\ No newline at end of file
diff --git a/static/stopwords/da b/static/stopwords/da

new file mode 100644 (file)

index 0000000..3d441a5
--- /dev/null
+++ b/static/stopwords/da
@@ -0,0 +1,170 @@
+ad
+af
+aldrig
+alle
+alt
+anden
+andet
+andre
+at
+bare
+begge
+blev
+blive
+bliver
+da
+de
+dem
+den
+denne
+der
+deres
+det
+dette
+dig
+din
+dine
+disse
+dit
+dog
+du
+efter
+ej
+eller
+en
+end
+ene
+eneste
+enhver
+er
+et
+far
+fem
+fik
+fire
+flere
+fleste
+for
+fordi
+forrige
+fra
+få
+får
+før
+god
+godt
+ham
+han
+hans
+har
+havde
+have
+hej
+helt
+hende
+hendes
+her
+hos
+hun
+hvad
+hvem
+hver
+hvilken
+hvis
+hvor
+hvordan
+hvorfor
+hvornår
+i
+ikke
+ind
+ingen
+intet
+ja
+jeg
+jer
+jeres
+jo
+kan
+kom
+komme
+kommer
+kun
+kunne
+lad
+lav
+lidt
+lige
+lille
+man
+mand
+mange
+med
+meget
+men
+mens
+mere
+mig
+min
+mine
+mit
+mod
+må
+ned
+nej
+ni
+nogen
+noget
+nogle
+nu
+ny
+nyt
+når
+nær
+næste
+næsten
+og
+også
+okay
+om
+op
+os
+otte
+over
+på
+se
+seks
+selv
+ser
+ses
+sig
+sige
+sin
+sine
+sit
+skal
+skulle
+som
+stor
+store
+syv
+så
+sådan
+tag
+tage
+thi
+ti
+til
+to
+tre
+ud
+under
+var
+ved
+vi
+vil
+ville
+vor
+vores
+være
+været
+\ No newline at end of file
diff --git a/static/stopwords/de b/static/stopwords/de

new file mode 100644 (file)

index 0000000..24666a6
--- /dev/null
+++ b/static/stopwords/de
@@ -0,0 +1,621 @@
+a
+ab
+aber
+ach
+acht
+achte
+achten
+achter
+achtes
+ag
+alle
+allein
+allem
+allen
+aller
+allerdings
+alles
+allgemeinen
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+anderm
+andern
+anderr
+anders
+au
+auch
+auf
+aus
+ausser
+ausserdem
+außer
+außerdem
+b
+bald
+bei
+beide
+beiden
+beim
+beispiel
+bekannt
+bereits
+besonders
+besser
+besten
+bin
+bis
+bisher
+bist
+c
+d
+d.h
+da
+dabei
+dadurch
+dafür
+dagegen
+daher
+dahin
+dahinter
+damals
+damit
+danach
+daneben
+dank
+dann
+daran
+darauf
+daraus
+darf
+darfst
+darin
+darum
+darunter
+darüber
+das
+dasein
+daselbst
+dass
+dasselbe
+davon
+davor
+dazu
+dazwischen
+daß
+dein
+deine
+deinem
+deinen
+deiner
+deines
+dem
+dementsprechend
+demgegenüber
+demgemäss
+demgemäß
+demselben
+demzufolge
+den
+denen
+denn
+denselben
+der
+deren
+derer
+derjenige
+derjenigen
+dermassen
+dermaßen
+derselbe
+derselben
+des
+deshalb
+desselben
+dessen
+deswegen
+dich
+die
+diejenige
+diejenigen
+dies
+diese
+dieselbe
+dieselben
+diesem
+diesen
+dieser
+dieses
+dir
+doch
+dort
+drei
+drin
+dritte
+dritten
+dritter
+drittes
+du
+durch
+durchaus
+durfte
+durften
+dürfen
+dürft
+e
+eben
+ebenso
+ehrlich
+ei
+ei,
+eigen
+eigene
+eigenen
+eigener
+eigenes
+ein
+einander
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+eins
+elf
+en
+ende
+endlich
+entweder
+er
+ernst
+erst
+erste
+ersten
+erster
+erstes
+es
+etwa
+etwas
+euch
+euer
+eure
+eurem
+euren
+eurer
+eures
+f
+folgende
+früher
+fünf
+fünfte
+fünften
+fünfter
+fünftes
+für
+g
+gab
+ganz
+ganze
+ganzen
+ganzer
+ganzes
+gar
+gedurft
+gegen
+gegenüber
+gehabt
+gehen
+geht
+gekannt
+gekonnt
+gemacht
+gemocht
+gemusst
+genug
+gerade
+gern
+gesagt
+geschweige
+gewesen
+gewollt
+geworden
+gibt
+ging
+gleich
+gott
+gross
+grosse
+grossen
+grosser
+grosses
+groß
+große
+großen
+großer
+großes
+gut
+gute
+guter
+gutes
+h
+hab
+habe
+haben
+habt
+hast
+hat
+hatte
+hatten
+hattest
+hattet
+heisst
+her
+heute
+hier
+hin
+hinter
+hoch
+hätte
+hätten
+i
+ich
+ihm
+ihn
+ihnen
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+im
+immer
+in
+indem
+infolgedessen
+ins
+irgend
+ist
+j
+ja
+jahr
+jahre
+jahren
+je
+jede
+jedem
+jeden
+jeder
+jedermann
+jedermanns
+jedes
+jedoch
+jemand
+jemandem
+jemanden
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+k
+kam
+kann
+kannst
+kaum
+kein
+keine
+keinem
+keinen
+keiner
+keines
+kleine
+kleinen
+kleiner
+kleines
+kommen
+kommt
+konnte
+konnten
+kurz
+können
+könnt
+könnte
+l
+lang
+lange
+leicht
+leide
+lieber
+los
+m
+machen
+macht
+machte
+mag
+magst
+mahn
+mal
+man
+manche
+manchem
+manchen
+mancher
+manches
+mann
+mehr
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mensch
+menschen
+mich
+mir
+mit
+mittel
+mochte
+mochten
+morgen
+muss
+musst
+musste
+mussten
+muß
+mußt
+möchte
+mögen
+möglich
+mögt
+müssen
+müsst
+müßt
+n
+na
+nach
+nachdem
+nahm
+natürlich
+neben
+nein
+neue
+neuen
+neun
+neunte
+neunten
+neunter
+neuntes
+nicht
+nichts
+nie
+niemand
+niemandem
+niemanden
+noch
+nun
+nur
+o
+ob
+oben
+oder
+offen
+oft
+ohne
+ordnung
+p
+q
+r
+recht
+rechte
+rechten
+rechter
+rechtes
+richtig
+rund
+s
+sa
+sache
+sagt
+sagte
+sah
+satt
+schlecht
+schluss
+schon
+sechs
+sechste
+sechsten
+sechster
+sechstes
+sehr
+sei
+seid
+seien
+sein
+seine
+seinem
+seinen
+seiner
+seines
+seit
+seitdem
+selbst
+sich
+sie
+sieben
+siebente
+siebenten
+siebenter
+siebentes
+sind
+so
+solang
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollen
+sollst
+sollt
+sollte
+sollten
+sondern
+sonst
+soweit
+sowie
+später
+startseite
+statt
+steht
+suche
+t
+tag
+tage
+tagen
+tat
+teil
+tel
+tritt
+trotzdem
+tun
+u
+uhr
+um
+und
+und?
+uns
+unse
+unsem
+unsen
+unser
+unsere
+unserer
+unses
+unter
+v
+vergangenen
+viel
+viele
+vielem
+vielen
+vielleicht
+vier
+vierte
+vierten
+vierter
+viertes
+vom
+von
+vor
+w
+wahr?
+wann
+war
+waren
+warst
+wart
+warum
+was
+weg
+wegen
+weil
+weit
+weiter
+weitere
+weiteren
+weiteres
+welche
+welchem
+welchen
+welcher
+welches
+wem
+wen
+wenig
+wenige
+weniger
+weniges
+wenigstens
+wenn
+wer
+werde
+werden
+werdet
+weshalb
+wessen
+wie
+wieder
+wieso
+will
+willst
+wir
+wird
+wirklich
+wirst
+wissen
+wo
+woher
+wohin
+wohl
+wollen
+wollt
+wollte
+wollten
+worden
+wurde
+wurden
+während
+währenddem
+währenddessen
+wäre
+würde
+würden
+x
+y
+z
+z.b
+zehn
+zehnte
+zehnten
+zehnter
+zehntes
+zeit
+zu
+zuerst
+zugleich
+zum
+zunächst
+zur
+zurück
+zusammen
+zwanzig
+zwar
+zwei
+zweite
+zweiten
+zweiter
+zweites
+zwischen
+zwölf
+über
+überhaupt
+übrigens
+\ No newline at end of file
diff --git a/static/stopwords/el b/static/stopwords/el

new file mode 100644 (file)

index 0000000..052606b
--- /dev/null
+++ b/static/stopwords/el
@@ -0,0 +1,265 @@
+αλλα
+αν
+αντι
+απο
+αυτα
+αυτεσ
+αυτη
+αυτο
+αυτοι
+αυτοσ
+αυτουσ
+αυτων
+αἱ
+αἳ
+αἵ
+αὐτόσ
+αὐτὸς
+αὖ
+γάρ
+γα
+γα^
+γε
+για
+γοῦν
+γὰρ
+δ'
+δέ
+δή
+δαί
+δαίσ
+δαὶ
+δαὶς
+δε
+δεν
+δι'
+διά
+διὰ
+δὲ
+δὴ
+δ’
+εαν
+ειμαι
+ειμαστε
+ειναι
+εισαι
+ειστε
+εκεινα
+εκεινεσ
+εκεινη
+εκεινο
+εκεινοι
+εκεινοσ
+εκεινουσ
+εκεινων
+ενω
+επ
+επι
+εἰ
+εἰμί
+εἰμὶ
+εἰς
+εἰσ
+εἴ
+εἴμι
+εἴτε
+η
+θα
+ισωσ
+κ
+καί
+καίτοι
+καθ
+και
+κατ
+κατά
+κατα
+κατὰ
+καὶ
+κι
+κἀν
+κἂν
+μέν
+μή
+μήτε
+μα
+με
+μεθ
+μετ
+μετά
+μετα
+μετὰ
+μη
+μην
+μἐν
+μὲν
+μὴ
+μὴν
+να
+ο
+οι
+ομωσ
+οπωσ
+οσο
+οτι
+οἱ
+οἳ
+οἷς
+οὐ
+οὐδ
+οὐδέ
+οὐδείσ
+οὐδεὶς
+οὐδὲ
+οὐδὲν
+οὐκ
+οὐχ
+οὐχὶ
+οὓς
+οὔτε
+οὕτω
+οὕτως
+οὕτωσ
+οὖν
+οὗ
+οὗτος
+οὗτοσ
+παρ
+παρά
+παρα
+παρὰ
+περί
+περὶ
+ποια
+ποιεσ
+ποιο
+ποιοι
+ποιοσ
+ποιουσ
+ποιων
+ποτε
+που
+ποῦ
+προ
+προσ
+πρόσ
+πρὸ
+πρὸς
+πως
+πωσ
+σε
+στη
+στην
+στο
+στον
+σόσ
+σύ
+σύν
+σὸς
+σὺ
+σὺν
+τά
+τήν
+τί
+τίς
+τίσ
+τα
+ταῖς
+τε
+την
+τησ
+τι
+τινα
+τις
+τισ
+το
+τοί
+τοι
+τοιοῦτος
+τοιοῦτοσ
+τον
+τοτε
+του
+τούσ
+τοὺς
+τοῖς
+τοῦ
+των
+τό
+τόν
+τότε
+τὰ
+τὰς
+τὴν
+τὸ
+τὸν
+τῆς
+τῆσ
+τῇ
+τῶν
+τῷ
+ωσ
+ἀλλ'
+ἀλλά
+ἀλλὰ
+ἀλλ’
+ἀπ
+ἀπό
+ἀπὸ
+ἀφ
+ἂν
+ἃ
+ἄλλος
+ἄλλοσ
+ἄν
+ἄρα
+ἅμα
+ἐάν
+ἐγώ
+ἐγὼ
+ἐκ
+ἐμόσ
+ἐμὸς
+ἐν
+ἐξ
+ἐπί
+ἐπεὶ
+ἐπὶ
+ἐστι
+ἐφ
+ἐὰν
+ἑαυτοῦ
+ἔτι
+ἡ
+ἢ
+ἣ
+ἤ
+ἥ
+ἧς
+ἵνα
+ὁ
+ὃ
+ὃν
+ὃς
+ὅ
+ὅδε
+ὅθεν
+ὅπερ
+ὅς
+ὅσ
+ὅστις
+ὅστισ
+ὅτε
+ὅτι
+ὑμόσ
+ὑπ
+ὑπέρ
+ὑπό
+ὑπὲρ
+ὑπὸ
+ὡς
+ὡσ
+ὥς
+ὥστε
+ὦ
+ᾧ
+\ No newline at end of file
diff --git a/static/stopwords/en b/static/stopwords/en

new file mode 100644 (file)

index 0000000..e095216
--- /dev/null
+++ b/static/stopwords/en
@@ -0,0 +1,1298 @@
+'ll
+'tis
+'twas
+'ve
+10
+39
+a
+a's
+able
+ableabout
+about
+above
+abroad
+abst
+accordance
+according
+accordingly
+across
+act
+actually
+ad
+added
+adj
+adopted
+ae
+af
+affected
+affecting
+affects
+after
+afterwards
+ag
+again
+against
+ago
+ah
+ahead
+ai
+ain't
+aint
+al
+all
+allow
+allows
+almost
+alone
+along
+alongside
+already
+also
+although
+always
+am
+amid
+amidst
+among
+amongst
+amoungst
+amount
+an
+and
+announce
+another
+any
+anybody
+anyhow
+anymore
+anyone
+anything
+anyway
+anyways
+anywhere
+ao
+apart
+apparently
+appear
+appreciate
+appropriate
+approximately
+aq
+ar
+are
+area
+areas
+aren
+aren't
+arent
+arise
+around
+arpa
+as
+aside
+ask
+asked
+asking
+asks
+associated
+at
+au
+auth
+available
+aw
+away
+awfully
+az
+b
+ba
+back
+backed
+backing
+backs
+backward
+backwards
+bb
+bd
+be
+became
+because
+become
+becomes
+becoming
+been
+before
+beforehand
+began
+begin
+beginning
+beginnings
+begins
+behind
+being
+beings
+believe
+below
+beside
+besides
+best
+better
+between
+beyond
+bf
+bg
+bh
+bi
+big
+bill
+billion
+biol
+bj
+bm
+bn
+bo
+both
+bottom
+br
+brief
+briefly
+bs
+bt
+but
+buy
+bv
+bw
+by
+bz
+c
+c'mon
+c's
+ca
+call
+came
+can
+can't
+cannot
+cant
+caption
+case
+cases
+cause
+causes
+cc
+cd
+certain
+certainly
+cf
+cg
+ch
+changes
+ci
+ck
+cl
+clear
+clearly
+click
+cm
+cmon
+cn
+co
+co.
+com
+come
+comes
+computer
+con
+concerning
+consequently
+consider
+considering
+contain
+containing
+contains
+copy
+corresponding
+could
+could've
+couldn
+couldn't
+couldnt
+course
+cr
+cry
+cs
+cu
+currently
+cv
+cx
+cy
+cz
+d
+dare
+daren't
+darent
+date
+de
+dear
+definitely
+describe
+described
+despite
+detail
+did
+didn
+didn't
+didnt
+differ
+different
+differently
+directly
+dj
+dk
+dm
+do
+does
+doesn
+doesn't
+doesnt
+doing
+don
+don't
+done
+dont
+doubtful
+down
+downed
+downing
+downs
+downwards
+due
+during
+dz
+e
+each
+early
+ec
+ed
+edu
+ee
+effect
+eg
+eh
+eight
+eighty
+either
+eleven
+else
+elsewhere
+empty
+end
+ended
+ending
+ends
+enough
+entirely
+er
+es
+especially
+et
+et-al
+etc
+even
+evenly
+ever
+evermore
+every
+everybody
+everyone
+everything
+everywhere
+ex
+exactly
+example
+except
+f
+face
+faces
+fact
+facts
+fairly
+far
+farther
+felt
+few
+fewer
+ff
+fi
+fifteen
+fifth
+fifty
+fify
+fill
+find
+finds
+fire
+first
+five
+fix
+fj
+fk
+fm
+fo
+followed
+following
+follows
+for
+forever
+former
+formerly
+forth
+forty
+forward
+found
+four
+fr
+free
+from
+front
+full
+fully
+further
+furthered
+furthering
+furthermore
+furthers
+fx
+g
+ga
+gave
+gb
+gd
+ge
+general
+generally
+get
+gets
+getting
+gf
+gg
+gh
+gi
+give
+given
+gives
+giving
+gl
+gm
+gmt
+gn
+go
+goes
+going
+gone
+good
+goods
+got
+gotten
+gov
+gp
+gq
+gr
+great
+greater
+greatest
+greetings
+group
+grouped
+grouping
+groups
+gs
+gt
+gu
+gw
+gy
+h
+had
+hadn't
+hadnt
+half
+happens
+hardly
+has
+hasn
+hasn't
+hasnt
+have
+haven
+haven't
+havent
+having
+he
+he'd
+he'll
+he's
+hed
+hell
+hello
+help
+hence
+her
+here
+here's
+hereafter
+hereby
+herein
+heres
+hereupon
+hers
+herself
+herse”
+hes
+hi
+hid
+high
+higher
+highest
+him
+himself
+himse”
+his
+hither
+hk
+hm
+hn
+home
+homepage
+hopefully
+how
+how'd
+how'll
+how's
+howbeit
+however
+hr
+ht
+htm
+html
+http
+hu
+hundred
+i
+i'd
+i'll
+i'm
+i've
+i.e.
+id
+ie
+if
+ignored
+ii
+il
+ill
+im
+immediate
+immediately
+importance
+important
+in
+inasmuch
+inc
+inc.
+indeed
+index
+indicate
+indicated
+indicates
+information
+inner
+inside
+insofar
+instead
+int
+interest
+interested
+interesting
+interests
+into
+invention
+inward
+io
+iq
+ir
+is
+isn
+isn't
+isnt
+it
+it'd
+it'll
+it's
+itd
+itll
+its
+itself
+itse”
+ive
+j
+je
+jm
+jo
+join
+jp
+just
+k
+ke
+keep
+keeps
+kept
+keys
+kg
+kh
+ki
+kind
+km
+kn
+knew
+know
+known
+knows
+kp
+kr
+kw
+ky
+kz
+l
+la
+large
+largely
+last
+lately
+later
+latest
+latter
+latterly
+lb
+lc
+least
+length
+less
+lest
+let
+let's
+lets
+li
+like
+liked
+likely
+likewise
+line
+little
+lk
+ll
+long
+longer
+longest
+look
+looking
+looks
+low
+lower
+lr
+ls
+lt
+ltd
+lu
+lv
+ly
+m
+ma
+made
+mainly
+make
+makes
+making
+man
+many
+may
+maybe
+mayn't
+maynt
+mc
+md
+me
+mean
+means
+meantime
+meanwhile
+member
+members
+men
+merely
+mg
+mh
+microsoft
+might
+might've
+mightn't
+mightnt
+mil
+mill
+million
+mine
+minus
+miss
+mk
+ml
+mm
+mn
+mo
+more
+moreover
+most
+mostly
+move
+mp
+mq
+mr
+mrs
+ms
+msie
+mt
+mu
+much
+mug
+must
+must've
+mustn't
+mustnt
+mv
+mw
+mx
+my
+myself
+myse”
+mz
+n
+na
+name
+namely
+nay
+nc
+nd
+ne
+near
+nearly
+necessarily
+necessary
+need
+needed
+needing
+needn't
+neednt
+needs
+neither
+net
+netscape
+never
+neverf
+neverless
+nevertheless
+new
+newer
+newest
+next
+nf
+ng
+ni
+nine
+ninety
+nl
+no
+no-one
+nobody
+non
+none
+nonetheless
+noone
+nor
+normally
+nos
+not
+noted
+nothing
+notwithstanding
+novel
+now
+nowhere
+np
+nr
+nu
+null
+number
+numbers
+nz
+o
+obtain
+obtained
+obviously
+of
+off
+often
+oh
+ok
+okay
+old
+older
+oldest
+om
+omitted
+on
+once
+one
+one's
+ones
+only
+onto
+open
+opened
+opening
+opens
+opposite
+or
+ord
+order
+ordered
+ordering
+orders
+org
+other
+others
+otherwise
+ought
+oughtn't
+oughtnt
+our
+ours
+ourselves
+out
+outside
+over
+overall
+owing
+own
+p
+pa
+page
+pages
+part
+parted
+particular
+particularly
+parting
+parts
+past
+pe
+per
+perhaps
+pf
+pg
+ph
+pk
+pl
+place
+placed
+places
+please
+plus
+pm
+pmid
+pn
+point
+pointed
+pointing
+points
+poorly
+possible
+possibly
+potentially
+pp
+pr
+predominantly
+present
+presented
+presenting
+presents
+presumably
+previously
+primarily
+probably
+problem
+problems
+promptly
+proud
+provided
+provides
+pt
+put
+puts
+pw
+py
+q
+qa
+que
+quickly
+quite
+qv
+r
+ran
+rather
+rd
+re
+readily
+really
+reasonably
+recent
+recently
+ref
+refs
+regarding
+regardless
+regards
+related
+relatively
+research
+reserved
+respectively
+resulted
+resulting
+results
+right
+ring
+ro
+room
+rooms
+round
+ru
+run
+rw
+s
+sa
+said
+same
+saw
+say
+saying
+says
+sb
+sc
+sd
+se
+sec
+second
+secondly
+seconds
+section
+see
+seeing
+seem
+seemed
+seeming
+seems
+seen
+sees
+self
+selves
+sensible
+sent
+serious
+seriously
+seven
+seventy
+several
+sg
+sh
+shall
+shan't
+shant
+she
+she'd
+she'll
+she's
+shed
+shell
+shes
+should
+should've
+shouldn
+shouldn't
+shouldnt
+show
+showed
+showing
+shown
+showns
+shows
+si
+side
+sides
+significant
+significantly
+similar
+similarly
+since
+sincere
+site
+six
+sixty
+sj
+sk
+sl
+slightly
+sm
+small
+smaller
+smallest
+sn
+so
+some
+somebody
+someday
+somehow
+someone
+somethan
+something
+sometime
+sometimes
+somewhat
+somewhere
+soon
+sorry
+specifically
+specified
+specify
+specifying
+sr
+st
+state
+states
+still
+stop
+strongly
+su
+sub
+substantially
+successfully
+such
+sufficiently
+suggest
+sup
+sure
+sv
+sy
+system
+sz
+t
+t's
+take
+taken
+taking
+tc
+td
+tell
+ten
+tends
+test
+text
+tf
+tg
+th
+than
+thank
+thanks
+thanx
+that
+that'll
+that's
+that've
+thatll
+thats
+thatve
+the
+their
+theirs
+them
+themselves
+then
+thence
+there
+there'd
+there'll
+there're
+there's
+there've
+thereafter
+thereby
+thered
+therefore
+therein
+therell
+thereof
+therere
+theres
+thereto
+thereupon
+thereve
+these
+they
+they'd
+they'll
+they're
+they've
+theyd
+theyll
+theyre
+theyve
+thick
+thin
+thing
+things
+think
+thinks
+third
+thirty
+this
+thorough
+thoroughly
+those
+thou
+though
+thoughh
+thought
+thoughts
+thousand
+three
+throug
+through
+throughout
+thru
+thus
+til
+till
+tip
+tis
+tj
+tk
+tm
+tn
+to
+today
+together
+too
+took
+top
+toward
+towards
+tp
+tr
+tried
+tries
+trillion
+truly
+try
+trying
+ts
+tt
+turn
+turned
+turning
+turns
+tv
+tw
+twas
+twelve
+twenty
+twice
+two
+tz
+u
+ua
+ug
+uk
+um
+un
+under
+underneath
+undoing
+unfortunately
+unless
+unlike
+unlikely
+until
+unto
+up
+upon
+ups
+upwards
+us
+use
+used
+useful
+usefully
+usefulness
+uses
+using
+usually
+uucp
+uy
+uz
+v
+va
+value
+various
+vc
+ve
+versus
+very
+vg
+vi
+via
+viz
+vn
+vol
+vols
+vs
+vu
+w
+want
+wanted
+wanting
+wants
+was
+wasn
+wasn't
+wasnt
+way
+ways
+we
+we'd
+we'll
+we're
+we've
+web
+webpage
+website
+wed
+welcome
+well
+wells
+went
+were
+weren
+weren't
+werent
+weve
+wf
+what
+what'd
+what'll
+what's
+what've
+whatever
+whatll
+whats
+whatve
+when
+when'd
+when'll
+when's
+whence
+whenever
+where
+where'd
+where'll
+where's
+whereafter
+whereas
+whereby
+wherein
+wheres
+whereupon
+wherever
+whether
+which
+whichever
+while
+whilst
+whim
+whither
+who
+who'd
+who'll
+who's
+whod
+whoever
+whole
+wholl
+whom
+whomever
+whos
+whose
+why
+why'd
+why'll
+why's
+widely
+width
+will
+willing
+wish
+with
+within
+without
+won
+won't
+wonder
+wont
+words
+work
+worked
+working
+works
+world
+would
+would've
+wouldn
+wouldn't
+wouldnt
+ws
+www
+x
+y
+ye
+year
+years
+yes
+yet
+you
+you'd
+you'll
+you're
+you've
+youd
+youll
+young
+younger
+youngest
+your
+youre
+yours
+yourself
+yourselves
+youve
+yt
+yu
+z
+za
+zero
+zm
+zr
+\ No newline at end of file
diff --git a/static/stopwords/eo b/static/stopwords/eo

new file mode 100644 (file)

index 0000000..bb209f3
--- /dev/null
+++ b/static/stopwords/eo
@@ -0,0 +1,173 @@
+adiaŭ
+ajn
+al
+ankoraŭ
+antaŭ
+aŭ
+bonan
+bonvole
+bonvolu
+bv
+ci
+cia
+cian
+cin
+d-ro
+da
+de
+dek
+deka
+do
+doktor'
+doktoro
+du
+dua
+dum
+eble
+ekz
+ekzemple
+en
+estas
+estis
+estos
+estu
+estus
+eĉ
+f-no
+feliĉan
+for
+fraŭlino
+ha
+havas
+havis
+havos
+havu
+havus
+he
+ho
+hu
+ili
+ilia
+ilian
+ilin
+inter
+io
+ion
+iu
+iujn
+iun
+ja
+jam
+je
+jes
+k
+kaj
+ke
+kio
+kion
+kiu
+kiujn
+kiun
+kvankam
+kvar
+kvara
+kvazaŭ
+kvin
+kvina
+la
+li
+lia
+lian
+lin
+malantaŭ
+male
+malgraŭ
+mem
+mi
+mia
+mian
+min
+minus
+naŭ
+naŭa
+ne
+nek
+nenio
+nenion
+neniu
+neniun
+nepre
+ni
+nia
+nian
+nin
+nu
+nun
+nur
+ok
+oka
+oni
+onia
+onian
+onin
+plej
+pli
+plu
+plus
+por
+post
+preter
+s-no
+s-ro
+se
+sed
+sep
+sepa
+ses
+sesa
+si
+sia
+sian
+sin
+sinjor'
+sinjorino
+sinjoro
+sub
+super
+supren
+sur
+tamen
+tio
+tion
+tiu
+tiujn
+tiun
+tra
+tri
+tria
+tuj
+tute
+unu
+unua
+ve
+verŝajne
+vi
+via
+vian
+vin
+ĉi
+ĉio
+ĉion
+ĉiu
+ĉiujn
+ĉiun
+ĉu
+ĝi
+ĝia
+ĝian
+ĝin
+ĝis
+ĵus
+ŝi
+ŝia
+ŝin
+\ No newline at end of file
diff --git a/static/stopwords/es b/static/stopwords/es

new file mode 100644 (file)

index 0000000..0cf607d
--- /dev/null
+++ b/static/stopwords/es
@@ -0,0 +1,732 @@
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+_
+a
+actualmente
+acuerdo
+adelante
+ademas
+además
+adrede
+afirmó
+agregó
+ahi
+ahora
+ahí
+al
+algo
+alguna
+algunas
+alguno
+algunos
+algún
+alli
+allí
+alrededor
+ambos
+ampleamos
+antano
+antaño
+ante
+anterior
+antes
+apenas
+aproximadamente
+aquel
+aquella
+aquellas
+aquello
+aquellos
+aqui
+aquél
+aquélla
+aquéllas
+aquéllos
+aquí
+arriba
+arribaabajo
+aseguró
+asi
+así
+atras
+aun
+aunque
+ayer
+añadió
+aún
+b
+bajo
+bastante
+bien
+breve
+buen
+buena
+buenas
+bueno
+buenos
+c
+cada
+casi
+cerca
+cierta
+ciertas
+cierto
+ciertos
+cinco
+claro
+comentó
+como
+con
+conmigo
+conocer
+conseguimos
+conseguir
+considera
+consideró
+consigo
+consigue
+consiguen
+consigues
+contigo
+contra
+cosas
+creo
+cual
+cuales
+cualquier
+cuando
+cuanta
+cuantas
+cuanto
+cuantos
+cuatro
+cuenta
+cuál
+cuáles
+cuándo
+cuánta
+cuántas
+cuánto
+cuántos
+cómo
+d
+da
+dado
+dan
+dar
+de
+debajo
+debe
+deben
+debido
+decir
+dejó
+del
+delante
+demasiado
+demás
+dentro
+deprisa
+desde
+despacio
+despues
+después
+detras
+detrás
+dia
+dias
+dice
+dicen
+dicho
+dieron
+diferente
+diferentes
+dijeron
+dijo
+dio
+donde
+dos
+durante
+día
+días
+dónde
+e
+ejemplo
+el
+ella
+ellas
+ello
+ellos
+embargo
+empleais
+emplean
+emplear
+empleas
+empleo
+en
+encima
+encuentra
+enfrente
+enseguida
+entonces
+entre
+era
+erais
+eramos
+eran
+eras
+eres
+es
+esa
+esas
+ese
+eso
+esos
+esta
+estaba
+estabais
+estaban
+estabas
+estad
+estada
+estadas
+estado
+estados
+estais
+estamos
+estan
+estando
+estar
+estaremos
+estará
+estarán
+estarás
+estaré
+estaréis
+estaría
+estaríais
+estaríamos
+estarían
+estarías
+estas
+este
+estemos
+esto
+estos
+estoy
+estuve
+estuviera
+estuvierais
+estuvieran
+estuvieras
+estuvieron
+estuviese
+estuvieseis
+estuviesen
+estuvieses
+estuvimos
+estuviste
+estuvisteis
+estuviéramos
+estuviésemos
+estuvo
+está
+estábamos
+estáis
+están
+estás
+esté
+estéis
+estén
+estés
+ex
+excepto
+existe
+existen
+explicó
+expresó
+f
+fin
+final
+fue
+fuera
+fuerais
+fueran
+fueras
+fueron
+fuese
+fueseis
+fuesen
+fueses
+fui
+fuimos
+fuiste
+fuisteis
+fuéramos
+fuésemos
+g
+general
+gran
+grandes
+gueno
+h
+ha
+haber
+habia
+habida
+habidas
+habido
+habidos
+habiendo
+habla
+hablan
+habremos
+habrá
+habrán
+habrás
+habré
+habréis
+habría
+habríais
+habríamos
+habrían
+habrías
+habéis
+había
+habíais
+habíamos
+habían
+habías
+hace
+haceis
+hacemos
+hacen
+hacer
+hacerlo
+haces
+hacia
+haciendo
+hago
+han
+has
+hasta
+hay
+haya
+hayamos
+hayan
+hayas
+hayáis
+he
+hecho
+hemos
+hicieron
+hizo
+horas
+hoy
+hube
+hubiera
+hubierais
+hubieran
+hubieras
+hubieron
+hubiese
+hubieseis
+hubiesen
+hubieses
+hubimos
+hubiste
+hubisteis
+hubiéramos
+hubiésemos
+hubo
+i
+igual
+incluso
+indicó
+informo
+informó
+intenta
+intentais
+intentamos
+intentan
+intentar
+intentas
+intento
+ir
+j
+junto
+k
+l
+la
+lado
+largo
+las
+le
+lejos
+les
+llegó
+lleva
+llevar
+lo
+los
+luego
+lugar
+m
+mal
+manera
+manifestó
+mas
+mayor
+me
+mediante
+medio
+mejor
+mencionó
+menos
+menudo
+mi
+mia
+mias
+mientras
+mio
+mios
+mis
+misma
+mismas
+mismo
+mismos
+modo
+momento
+mucha
+muchas
+mucho
+muchos
+muy
+más
+mí
+mía
+mías
+mío
+míos
+n
+nada
+nadie
+ni
+ninguna
+ningunas
+ninguno
+ningunos
+ningún
+no
+nos
+nosotras
+nosotros
+nuestra
+nuestras
+nuestro
+nuestros
+nueva
+nuevas
+nuevo
+nuevos
+nunca
+o
+ocho
+os
+otra
+otras
+otro
+otros
+p
+pais
+para
+parece
+parte
+partir
+pasada
+pasado
+paìs
+peor
+pero
+pesar
+poca
+pocas
+poco
+pocos
+podeis
+podemos
+poder
+podria
+podriais
+podriamos
+podrian
+podrias
+podrá
+podrán
+podría
+podrían
+poner
+por
+por qué
+porque
+posible
+primer
+primera
+primero
+primeros
+principalmente
+pronto
+propia
+propias
+propio
+propios
+proximo
+próximo
+próximos
+pudo
+pueda
+puede
+pueden
+puedo
+pues
+q
+qeu
+que
+quedó
+queremos
+quien
+quienes
+quiere
+quiza
+quizas
+quizá
+quizás
+quién
+quiénes
+qué
+r
+raras
+realizado
+realizar
+realizó
+repente
+respecto
+s
+sabe
+sabeis
+sabemos
+saben
+saber
+sabes
+sal
+salvo
+se
+sea
+seamos
+sean
+seas
+segun
+segunda
+segundo
+según
+seis
+ser
+sera
+seremos
+será
+serán
+serás
+seré
+seréis
+sería
+seríais
+seríamos
+serían
+serías
+seáis
+señaló
+si
+sido
+siempre
+siendo
+siete
+sigue
+siguiente
+sin
+sino
+sobre
+sois
+sola
+solamente
+solas
+solo
+solos
+somos
+son
+soy
+soyos
+su
+supuesto
+sus
+suya
+suyas
+suyo
+suyos
+sé
+sí
+sólo
+t
+tal
+tambien
+también
+tampoco
+tan
+tanto
+tarde
+te
+temprano
+tendremos
+tendrá
+tendrán
+tendrás
+tendré
+tendréis
+tendría
+tendríais
+tendríamos
+tendrían
+tendrías
+tened
+teneis
+tenemos
+tener
+tenga
+tengamos
+tengan
+tengas
+tengo
+tengáis
+tenida
+tenidas
+tenido
+tenidos
+teniendo
+tenéis
+tenía
+teníais
+teníamos
+tenían
+tenías
+tercera
+ti
+tiempo
+tiene
+tienen
+tienes
+toda
+todas
+todavia
+todavía
+todo
+todos
+total
+trabaja
+trabajais
+trabajamos
+trabajan
+trabajar
+trabajas
+trabajo
+tras
+trata
+través
+tres
+tu
+tus
+tuve
+tuviera
+tuvierais
+tuvieran
+tuvieras
+tuvieron
+tuviese
+tuvieseis
+tuviesen
+tuvieses
+tuvimos
+tuviste
+tuvisteis
+tuviéramos
+tuviésemos
+tuvo
+tuya
+tuyas
+tuyo
+tuyos
+tú
+u
+ultimo
+un
+una
+unas
+uno
+unos
+usa
+usais
+usamos
+usan
+usar
+usas
+uso
+usted
+ustedes
+v
+va
+vais
+valor
+vamos
+van
+varias
+varios
+vaya
+veces
+ver
+verdad
+verdadera
+verdadero
+vez
+vosotras
+vosotros
+voy
+vuestra
+vuestras
+vuestro
+vuestros
+w
+x
+y
+ya
+yo
+z
+él
+éramos
+ésa
+ésas
+ése
+ésos
+ésta
+éstas
+éste
+éstos
+última
+últimas
+último
+últimos
+\ No newline at end of file
diff --git a/static/stopwords/et b/static/stopwords/et

new file mode 100644 (file)

index 0000000..0914094
--- /dev/null
+++ b/static/stopwords/et
@@ -0,0 +1,35 @@
+aga
+ei
+et
+ja
+jah
+kas
+kui
+kõik
+ma
+me
+mida
+midagi
+mind
+minu
+mis
+mu
+mul
+mulle
+nad
+nii
+oled
+olen
+oli
+oma
+on
+pole
+sa
+seda
+see
+selle
+siin
+siis
+ta
+te
+ära
+\ No newline at end of file
diff --git a/static/stopwords/eu b/static/stopwords/eu

new file mode 100644 (file)

index 0000000..ded509c
--- /dev/null
+++ b/static/stopwords/eu
@@ -0,0 +1,98 @@
+al
+anitz
+arabera
+asko
+baina
+bat
+batean
+batek
+bati
+batzuei
+batzuek
+batzuetan
+batzuk
+bera
+beraiek
+berau
+berauek
+bere
+berori
+beroriek
+beste
+bezala
+da
+dago
+dira
+ditu
+du
+dute
+edo
+egin
+ere
+eta
+eurak
+ez
+gainera
+gu
+gutxi
+guzti
+haiei
+haiek
+haietan
+hainbeste
+hala
+han
+handik
+hango
+hara
+hari
+hark
+hartan
+hau
+hauei
+hauek
+hauetan
+hemen
+hemendik
+hemengo
+hi
+hona
+honek
+honela
+honetan
+honi
+hor
+hori
+horiei
+horiek
+horietan
+horko
+horra
+horrek
+horrela
+horretan
+horri
+hortik
+hura
+izan
+ni
+noiz
+nola
+non
+nondik
+nongo
+nor
+nora
+ze
+zein
+zen
+zenbait
+zenbat
+zer
+zergatik
+ziren
+zituen
+zu
+zuek
+zuen
+zuten
+\ No newline at end of file
diff --git a/static/stopwords/fa b/static/stopwords/fa

new file mode 100644 (file)

index 0000000..4df893c
--- /dev/null
+++ b/static/stopwords/fa
@@ -0,0 +1,799 @@
+!
+,
+.
+:
+;
+،
+؛
+؟
+آباد
+آره
+آری
+آمد
+آمده
+آن
+آنان
+آنجا
+آنطور
+آنقدر
+آنكه
+آنها
+آنچه
+آنکه
+آورد
+آورده
+آيد
+آی
+آیا
+آیند
+اتفاقا
+اثرِ
+احتراما
+احتمالا
+اخیر
+اری
+از
+ازجمله
+اساسا
+است
+استفاد
+استفاده
+اش
+اشکارا
+اصلا
+اصولا
+اعلام
+اغلب
+اكنون
+الان
+البته
+البتّه
+ام
+اما
+امروز
+امروزه
+امسال
+امشب
+امور
+ان
+انجام
+اند
+انشاالله
+انصافا
+انطور
+انقدر
+انها
+انچنان
+انکه
+انگار
+او
+اول
+اولا
+اي
+ايشان
+ايم
+اين
+اينكه
+اکثرا
+اکنون
+اگر
+ای
+ایا
+اید
+ایشان
+ایم
+این
+اینجا
+ایند
+اینطور
+اینقدر
+اینها
+اینچنین
+اینک
+اینکه
+اینگونه
+با
+بار
+بارة
+باره
+بارها
+باز
+بازهم
+باش
+باشد
+باشم
+باشند
+باشيم
+باشی
+باشید
+باشیم
+بالا
+بالاخره
+بالایِ
+بالطبع
+بايد
+باید
+بتوان
+بتواند
+بتوانی
+بتوانیم
+بخش
+بخشی
+بخواه
+بخواهد
+بخواهم
+بخواهند
+بخواهی
+بخواهید
+بخواهیم
+بد
+بدون
+بر
+برابر
+برابرِ
+براحتی
+براساس
+براستی
+براي
+برای
+برایِ
+برخوردار
+برخي
+برخی
+برداري
+برعکس
+بروز
+بزرگ
+بزودی
+بسا
+بسيار
+بسياري
+بسیار
+بسیاری
+بطور
+بعد
+بعدا
+بعدها
+بعری
+بعضا
+بعضي
+بلافاصله
+بلكه
+بله
+بلکه
+بلی
+بنابراين
+بنابراین
+بندي
+به
+بهتر
+بهترين
+بود
+بودم
+بودن
+بودند
+بوده
+بودی
+بودید
+بودیم
+بویژه
+بي
+بيست
+بيش
+بيشتر
+بيشتري
+بين
+بکن
+بکند
+بکنم
+بکنند
+بکنی
+بکنید
+بکنیم
+بگو
+بگوید
+بگویم
+بگویند
+بگویی
+بگویید
+بگوییم
+بگیر
+بگیرد
+بگیرم
+بگیرند
+بگیری
+بگیرید
+بگیریم
+بی
+بیا
+بیاب
+بیابد
+بیابم
+بیابند
+بیابی
+بیابید
+بیابیم
+بیاور
+بیاورد
+بیاورم
+بیاورند
+بیاوری
+بیاورید
+بیاوریم
+بیاید
+بیایم
+بیایند
+بیایی
+بیایید
+بیاییم
+بیرون
+بیرونِ
+بیش
+بیشتر
+بیشتری
+بین
+ت
+تا
+تازه
+تاكنون
+تان
+تاکنون
+تحت
+تر
+تر  براساس
+ترين
+تقریبا
+تلویحا
+تمام
+تماما
+تمامي
+تنها
+تو
+تواند
+توانست
+توانستم
+توانستن
+توانستند
+توانسته
+توانستی
+توانستیم
+توانم
+توانند
+توانی
+توانید
+توانیم
+توسط
+تولِ
+تویِ
+ثانیا
+جا
+جاي
+جايي
+جای
+جدا
+جديد
+جدید
+جريان
+جریان
+جز
+جلوگيري
+جلویِ
+جمعا
+جناح
+جهت
+حاضر
+حال
+حالا
+حتما
+حتي
+حتی
+حداکثر
+حدودا
+حدودِ
+حق
+خارجِ
+خب
+خدمات
+خصوصا
+خلاصه
+خواست
+خواستم
+خواستن
+خواستند
+خواسته
+خواستی
+خواستید
+خواستیم
+خواهد
+خواهم
+خواهند
+خواهيم
+خواهی
+خواهید
+خواهیم
+خوب
+خود
+خودت
+خودتان
+خودش
+خودشان
+خودم
+خودمان
+خوشبختانه
+خويش
+خویش
+خویشتن
+خیاه
+خیر
+خیلی
+داد
+دادم
+دادن
+دادند
+داده
+دادی
+دادید
+دادیم
+دار
+دارد
+دارم
+دارند
+داريم
+داری
+دارید
+داریم
+داشت
+داشتم
+داشتن
+داشتند
+داشته
+داشتی
+داشتید
+داشتیم
+دانست
+دانند
+دایم
+دایما
+در
+درباره
+درمجموع
+درون
+دریغ
+دقیقا
+دنبالِ
+ده
+دهد
+دهم
+دهند
+دهی
+دهید
+دهیم
+دو
+دوباره
+دوم
+ديده
+ديروز
+ديگر
+ديگران
+ديگري
+دیر
+دیروز
+دیگر
+دیگران
+دیگری
+را
+راحت
+راسا
+راستی
+راه
+رسما
+رسید
+رفت
+رفته
+رو
+روب
+روز
+روزانه
+روزهاي
+روي
+روی
+رویِ
+ريزي
+زمان
+زمانی
+زمینه
+زود
+زياد
+زير
+زيرا
+زیر
+زیرِ
+سابق
+ساخته
+سازي
+سالانه
+سالیانه
+سایر
+سراسر
+سرانجام
+سریعا
+سریِ
+سعي
+سمتِ
+سوم
+سوي
+سوی
+سویِ
+سپس
+شان
+شايد
+شاید
+شخصا
+شد
+شدم
+شدن
+شدند
+شده
+شدی
+شدید
+شدیدا
+شدیم
+شش
+شش  نداشته
+شما
+شناسي
+شود
+شوم
+شوند
+شونده
+شوی
+شوید
+شویم
+صرفا
+صورت
+ضدِّ
+ضدِّ
+ضمن
+طبعا
+طبقِ
+طبیعتا
+طرف
+طريق
+طریق
+طور
+طي
+طی
+ظاهرا
+عدم
+عقبِ
+علّتِ
+علیه
+عمدا
+عمدتا
+عمل
+عملا
+عنوان
+عنوانِ
+غالبا
+غير
+غیر
+فردا
+فعلا
+فقط
+فكر
+فوق
+قابل
+قبل
+قبلا
+قدری
+قصدِ
+قطعا
+كرد
+كردم
+كردن
+كردند
+كرده
+كسي
+كل
+كمتر
+كند
+كنم
+كنند
+كنيد
+كنيم
+كه
+لااقل
+لطفا
+لطفاً
+ما
+مان
+مانند
+مانندِ
+مبادا
+متاسفانه
+متعاقبا
+مثل
+مثلا
+مثلِ
+مجانی
+مجددا
+مجموعا
+مختلف
+مدام
+مدت
+مدّتی
+مردم
+مرسی
+مستقیما
+مسلما
+مطمینا
+معمولا
+مقابل
+ممکن
+من
+موارد
+مورد
+موقتا
+مي
+ميليارد
+ميليون
+مگر
+می
+می شود
+میان
+می‌رسد
+می‌رود
+می‌شود
+می‌کنیم
+ناشي
+نام
+ناگاه
+ناگهان
+ناگهانی
+نبايد
+نباید
+نبود
+نخست
+نخستين
+نخواهد
+نخواهم
+نخواهند
+نخواهی
+نخواهید
+نخواهیم
+ندارد
+ندارم
+ندارند
+نداری
+ندارید
+نداریم
+نداشت
+نداشتم
+نداشتند
+نداشته
+نداشتی
+نداشتید
+نداشتیم
+نزديك
+نزدِ
+نزدیکِ
+نسبتا
+نشان
+نشده
+نظير
+نظیر
+نكرده
+نمايد
+نمي
+نمی
+نمی‌شود
+نه
+نهایتا
+نوع
+نوعي
+نوعی
+نيز
+نيست
+نگاه
+نیز
+نیست
+ها
+هاي
+هايي
+های
+هایی
+هبچ
+هر
+هرچه
+هرگز
+هزار
+هست
+هستم
+هستند
+هستيم
+هستی
+هستید
+هستیم
+هفت
+هم
+همان
+همه
+همواره
+همين
+همچنان
+همچنين
+همچنین
+همچون
+همیشه
+همین
+هنوز
+هنگام
+هنگامِ
+هنگامی
+هيچ
+هیچ
+هیچگاه
+و
+واقعا
+واقعی
+وجود
+وسطِ
+وضع
+وقتي
+وقتی
+وقتیکه
+ولی
+وي
+وگو
+وی
+ویژه
+يا
+يابد
+يك
+يكديگر
+يكي
+ّه
+٪
+پارسال
+پاعینِ
+پس
+پنج
+پيش
+پیدا
+پیش
+پیشاپیش
+پیشتر
+پیشِ
+چرا
+چطور
+چقدر
+چنان
+چنانچه
+چنانکه
+چند
+چندین
+چنين
+چنین
+چه
+چهار
+چو
+چون
+چيزي
+چگونه
+چیز
+چیزی
+چیست
+کاش
+کامل
+کاملا
+کتبا
+کجا
+کجاست
+کدام
+کرد
+کردم
+کردن
+کردند
+کرده
+کردی
+کردید
+کردیم
+کس
+کسانی
+کسی
+کل
+کلا
+کم
+کماکان
+کمتر
+کمتری
+کمی
+کن
+کنار
+کنارِ
+کند
+کنم
+کنند
+کننده
+کنون
+کنونی
+کنی
+کنید
+کنیم
+که
+کو
+کَی
+کی
+گاه
+گاهی
+گذاري
+گذاشته
+گذشته
+گردد
+گرفت
+گرفتم
+گرفتن
+گرفتند
+گرفته
+گرفتی
+گرفتید
+گرفتیم
+گروهي
+گفت
+گفتم
+گفتن
+گفتند
+گفته
+گفتی
+گفتید
+گفتیم
+گه
+گهگاه
+گو
+گويد
+گويند
+گویا
+گوید
+گویم
+گویند
+گویی
+گویید
+گوییم
+گيرد
+گيري
+گیرد
+گیرم
+گیرند
+گیری
+گیرید
+گیریم
+ی
+یا
+یابد
+یابم
+یابند
+یابی
+یابید
+یابیم
+یافت
+یافتم
+یافتن
+یافته
+یافتی
+یافتید
+یافتیم
+یعنی
+یقینا
+یه
+یک
+یکی
+۰
+۱
+۲
+۳
+۴
+۵
+۶
+۷
+۸
+۹
+\ No newline at end of file
diff --git a/static/stopwords/fi b/static/stopwords/fi

new file mode 100644 (file)

index 0000000..84f0006
--- /dev/null
+++ b/static/stopwords/fi
@@ -0,0 +1,847 @@
+aiemmin
+aika
+aikaa
+aikaan
+aikaisemmin
+aikaisin
+aikajen
+aikana
+aikoina
+aikoo
+aikovat
+aina
+ainakaan
+ainakin
+ainoa
+ainoat
+aiomme
+aion
+aiotte
+aist
+aivan
+ajan
+alas
+alemmas
+alkuisin
+alkuun
+alla
+alle
+aloitamme
+aloitan
+aloitat
+aloitatte
+aloitattivat
+aloitettava
+aloitettevaksi
+aloitettu
+aloitimme
+aloitin
+aloitit
+aloititte
+aloittaa
+aloittamatta
+aloitti
+aloittivat
+alta
+aluksi
+alussa
+alusta
+annettavaksi
+annetteva
+annettu
+ansiosta
+antaa
+antamatta
+antoi
+aoua
+apu
+asia
+asiaa
+asian
+asiasta
+asiat
+asioiden
+asioihin
+asioita
+asti
+avuksi
+avulla
+avun
+avutta
+edelle
+edelleen
+edellä
+edeltä
+edemmäs
+edes
+edessä
+edestä
+ehkä
+ei
+eikä
+eilen
+eivät
+eli
+ellei
+elleivät
+ellemme
+ellen
+ellet
+ellette
+emme
+en
+enemmän
+eniten
+ennen
+ensi
+ensimmäinen
+ensimmäiseksi
+ensimmäisen
+ensimmäisenä
+ensimmäiset
+ensimmäisiksi
+ensimmäisinä
+ensimmäisiä
+ensimmäistä
+ensin
+entinen
+entisen
+entisiä
+entisten
+entistä
+enää
+eri
+erittäin
+erityisesti
+eräiden
+eräs
+eräät
+esi
+esiin
+esillä
+esimerkiksi
+et
+eteen
+etenkin
+etessa
+ette
+ettei
+että
+haikki
+halua
+haluaa
+haluamatta
+haluamme
+haluan
+haluat
+haluatte
+haluavat
+halunnut
+halusi
+halusimme
+halusin
+halusit
+halusitte
+halusivat
+halutessa
+haluton
+he
+hei
+heidän
+heidät
+heihin
+heille
+heillä
+heiltä
+heissä
+heistä
+heitä
+helposti
+heti
+hetkellä
+hieman
+hitaasti
+hoikein
+huolimatta
+huomenna
+hyvien
+hyviin
+hyviksi
+hyville
+hyviltä
+hyvin
+hyvinä
+hyvissä
+hyvistä
+hyviä
+hyvä
+hyvät
+hyvää
+hän
+häneen
+hänelle
+hänellä
+häneltä
+hänen
+hänessä
+hänestä
+hänet
+häntä
+ihan
+ilman
+ilmeisesti
+itse
+itsensä
+itseään
+ja
+jo
+johon
+joiden
+joihin
+joiksi
+joilla
+joille
+joilta
+joina
+joissa
+joista
+joita
+joka
+jokainen
+jokin
+joko
+joksi
+joku
+jolla
+jolle
+jolloin
+jolta
+jompikumpi
+jona
+jonka
+jonkin
+jonne
+joo
+jopa
+jos
+joskus
+jossa
+josta
+jota
+jotain
+joten
+jotenkin
+jotenkuten
+jotka
+jotta
+jouduimme
+jouduin
+jouduit
+jouduitte
+joudumme
+joudun
+joudutte
+joukkoon
+joukossa
+joukosta
+joutua
+joutui
+joutuivat
+joutumaan
+joutuu
+joutuvat
+juuri
+jälkeen
+jälleen
+jää
+kahdeksan
+kahdeksannen
+kahdella
+kahdelle
+kahdelta
+kahden
+kahdessa
+kahdesta
+kahta
+kahteen
+kai
+kaiken
+kaikille
+kaikilta
+kaikkea
+kaikki
+kaikkia
+kaikkiaan
+kaikkialla
+kaikkialle
+kaikkialta
+kaikkien
+kaikkin
+kaksi
+kannalta
+kannattaa
+kanssa
+kanssaan
+kanssamme
+kanssani
+kanssanne
+kanssasi
+kauan
+kauemmas
+kaukana
+kautta
+kehen
+keiden
+keihin
+keiksi
+keille
+keillä
+keiltä
+keinä
+keissä
+keistä
+keitten
+keittä
+keitä
+keneen
+keneksi
+kenelle
+kenellä
+keneltä
+kenen
+kenenä
+kenessä
+kenestä
+kenet
+kenettä
+kennessästä
+kenties
+kerran
+kerta
+kertaa
+keskellä
+kesken
+keskimäärin
+ketkä
+ketä
+kiitos
+kohti
+koko
+kokonaan
+kolmas
+kolme
+kolmen
+kolmesti
+koska
+koskaan
+kovin
+kuin
+kuinka
+kuinkan
+kuitenkaan
+kuitenkin
+kuka
+kukaan
+kukin
+kukka
+kumpainen
+kumpainenkaan
+kumpi
+kumpikaan
+kumpikin
+kun
+kuten
+kuuden
+kuusi
+kuutta
+kylliksi
+kyllä
+kymmenen
+kyse
+liian
+liki
+lisäksi
+lisää
+lla
+luo
+luona
+lähekkäin
+lähelle
+lähellä
+läheltä
+lähemmäs
+lähes
+lähinnä
+lähtien
+läpi
+mahdollisimman
+mahdollista
+me
+meidän
+meidät
+meihin
+meille
+meillä
+meiltä
+meissä
+meistä
+meitä
+melkein
+melko
+menee
+meneet
+menemme
+menen
+menet
+menette
+menevät
+meni
+menimme
+menin
+menit
+menivät
+mennessä
+mennyt
+menossa
+mihin
+mikin
+miksi
+mikä
+mikäli
+mikään
+mille
+milloin
+milloinkan
+millä
+miltä
+minkä
+minne
+minua
+minulla
+minulle
+minulta
+minun
+minussa
+minusta
+minut
+minuun
+minä
+missä
+mistä
+miten
+mitkä
+mitä
+mitään
+moi
+molemmat
+mones
+monesti
+monet
+moni
+moniaalla
+moniaalle
+moniaalta
+monta
+muassa
+muiden
+muita
+muka
+mukaan
+mukaansa
+mukana
+mutta
+muu
+muualla
+muualle
+muualta
+muuanne
+muulloin
+muun
+muut
+muuta
+muutama
+muutaman
+muuten
+myöhemmin
+myös
+myöskin
+myöskään
+myötä
+ne
+neljä
+neljän
+neljää
+niiden
+niihin
+niiksi
+niille
+niillä
+niiltä
+niin
+niinä
+niissä
+niistä
+niitä
+noiden
+noihin
+noiksi
+noilla
+noille
+noilta
+noin
+noina
+noissa
+noista
+noita
+nopeammin
+nopeasti
+nopeiten
+nro
+nuo
+nyt
+näiden
+näihin
+näiksi
+näille
+näillä
+näiltä
+näin
+näinä
+näissä
+näissähin
+näissälle
+näissältä
+näissästä
+näistä
+näitä
+nämä
+ohi
+oikea
+oikealla
+oikein
+ole
+olemme
+olen
+olet
+olette
+oleva
+olevan
+olevat
+oli
+olimme
+olin
+olisi
+olisimme
+olisin
+olisit
+olisitte
+olisivat
+olit
+olitte
+olivat
+olla
+olleet
+olli
+ollut
+oma
+omaa
+omaan
+omaksi
+omalle
+omalta
+oman
+omassa
+omat
+omia
+omien
+omiin
+omiksi
+omille
+omilta
+omissa
+omista
+on
+onkin
+onko
+ovat
+paikoittain
+paitsi
+pakosti
+paljon
+paremmin
+parempi
+parhaillaan
+parhaiten
+perusteella
+peräti
+pian
+pieneen
+pieneksi
+pienelle
+pienellä
+pieneltä
+pienempi
+pienestä
+pieni
+pienin
+poikki
+puolesta
+puolestaan
+päälle
+runsaasti
+saakka
+sadam
+sama
+samaa
+samaan
+samalla
+samallalta
+samallassa
+samallasta
+saman
+samat
+samoin
+sata
+sataa
+satojen
+se
+seitsemän
+sekä
+sen
+seuraavat
+siellä
+sieltä
+siihen
+siinä
+siis
+siitä
+sijaan
+siksi
+sille
+silloin
+sillä
+silti
+siltä
+sinne
+sinua
+sinulla
+sinulle
+sinulta
+sinun
+sinussa
+sinusta
+sinut
+sinuun
+sinä
+sisäkkäin
+sisällä
+siten
+sitten
+sitä
+ssa
+sta
+suoraan
+suuntaan
+suuren
+suuret
+suuri
+suuria
+suurin
+suurten
+taa
+taas
+taemmas
+tahansa
+tai
+takaa
+takaisin
+takana
+takia
+tallä
+tapauksessa
+tarpeeksi
+tavalla
+tavoitteena
+te
+teidän
+teidät
+teihin
+teille
+teillä
+teiltä
+teissä
+teistä
+teitä
+tietysti
+todella
+toinen
+toisaalla
+toisaalle
+toisaalta
+toiseen
+toiseksi
+toisella
+toiselle
+toiselta
+toisemme
+toisen
+toisensa
+toisessa
+toisesta
+toista
+toistaiseksi
+toki
+tosin
+tuhannen
+tuhat
+tule
+tulee
+tulemme
+tulen
+tulet
+tulette
+tulevat
+tulimme
+tulin
+tulisi
+tulisimme
+tulisin
+tulisit
+tulisitte
+tulisivat
+tulit
+tulitte
+tulivat
+tulla
+tulleet
+tullut
+tuntuu
+tuo
+tuohon
+tuoksi
+tuolla
+tuolle
+tuolloin
+tuolta
+tuon
+tuona
+tuonne
+tuossa
+tuosta
+tuota
+tuotä
+tuskin
+tykö
+tähän
+täksi
+tälle
+tällä
+tällöin
+tältä
+tämä
+tämän
+tänne
+tänä
+tänään
+tässä
+tästä
+täten
+tätä
+täysin
+täytyvät
+täytyy
+täällä
+täältä
+ulkopuolella
+usea
+useasti
+useimmiten
+usein
+useita
+uudeksi
+uudelleen
+uuden
+uudet
+uusi
+uusia
+uusien
+uusinta
+uuteen
+uutta
+vaan
+vahemmän
+vai
+vaiheessa
+vaikea
+vaikean
+vaikeat
+vaikeilla
+vaikeille
+vaikeilta
+vaikeissa
+vaikeista
+vaikka
+vain
+varmasti
+varsin
+varsinkin
+varten
+vasen
+vasenmalla
+vasta
+vastaan
+vastakkain
+vastan
+verran
+vielä
+vierekkäin
+vieressä
+vieri
+viiden
+viime
+viimeinen
+viimeisen
+viimeksi
+viisi
+voi
+voidaan
+voimme
+voin
+voisi
+voit
+voitte
+voivat
+vuoden
+vuoksi
+vuosi
+vuosien
+vuosina
+vuotta
+vähemmän
+vähintään
+vähiten
+vähän
+välillä
+yhdeksän
+yhden
+yhdessä
+yhteen
+yhteensä
+yhteydessä
+yhteyteen
+yhtä
+yhtäälle
+yhtäällä
+yhtäältä
+yhtään
+yhä
+yksi
+yksin
+yksittäin
+yleensä
+ylemmäs
+yli
+ylös
+ympäri
+älköön
+älä
+\ No newline at end of file
diff --git a/static/stopwords/fr b/static/stopwords/fr

new file mode 100644 (file)

index 0000000..0e2789f
--- /dev/null
+++ b/static/stopwords/fr
@@ -0,0 +1,689 @@
+a
+abord
+absolument
+afin
+ah
+ai
+aie
+aient
+aies
+ailleurs
+ainsi
+ait
+allaient
+allo
+allons
+allô
+alors
+anterieur
+anterieure
+anterieures
+apres
+après
+as
+assez
+attendu
+au
+aucun
+aucune
+aucuns
+aujourd
+aujourd'hui
+aupres
+auquel
+aura
+aurai
+auraient
+aurais
+aurait
+auras
+aurez
+auriez
+aurions
+aurons
+auront
+aussi
+autre
+autrefois
+autrement
+autres
+autrui
+aux
+auxquelles
+auxquels
+avaient
+avais
+avait
+avant
+avec
+avez
+aviez
+avions
+avoir
+avons
+ayant
+ayez
+ayons
+b
+bah
+bas
+basee
+bat
+beau
+beaucoup
+bien
+bigre
+bon
+boum
+bravo
+brrr
+c
+car
+ce
+ceci
+cela
+celle
+celle-ci
+celle-là
+celles
+celles-ci
+celles-là
+celui
+celui-ci
+celui-là
+celà
+cent
+cependant
+certain
+certaine
+certaines
+certains
+certes
+ces
+cet
+cette
+ceux
+ceux-ci
+ceux-là
+chacun
+chacune
+chaque
+cher
+chers
+chez
+chiche
+chut
+chère
+chères
+ci
+cinq
+cinquantaine
+cinquante
+cinquantième
+cinquième
+clac
+clic
+combien
+comme
+comment
+comparable
+comparables
+compris
+concernant
+contre
+couic
+crac
+d
+da
+dans
+de
+debout
+dedans
+dehors
+deja
+delà
+depuis
+dernier
+derniere
+derriere
+derrière
+des
+desormais
+desquelles
+desquels
+dessous
+dessus
+deux
+deuxième
+deuxièmement
+devant
+devers
+devra
+devrait
+different
+differentes
+differents
+différent
+différente
+différentes
+différents
+dire
+directe
+directement
+dit
+dite
+dits
+divers
+diverse
+diverses
+dix
+dix-huit
+dix-neuf
+dix-sept
+dixième
+doit
+doivent
+donc
+dont
+dos
+douze
+douzième
+dring
+droite
+du
+duquel
+durant
+dès
+début
+désormais
+e
+effet
+egale
+egalement
+egales
+eh
+elle
+elle-même
+elles
+elles-mêmes
+en
+encore
+enfin
+entre
+envers
+environ
+es
+essai
+est
+et
+etant
+etc
+etre
+eu
+eue
+eues
+euh
+eurent
+eus
+eusse
+eussent
+eusses
+eussiez
+eussions
+eut
+eux
+eux-mêmes
+exactement
+excepté
+extenso
+exterieur
+eûmes
+eût
+eûtes
+f
+fais
+faisaient
+faisant
+fait
+faites
+façon
+feront
+fi
+flac
+floc
+fois
+font
+force
+furent
+fus
+fusse
+fussent
+fusses
+fussiez
+fussions
+fut
+fûmes
+fût
+fûtes
+g
+gens
+h
+ha
+haut
+hein
+hem
+hep
+hi
+ho
+holà
+hop
+hormis
+hors
+hou
+houp
+hue
+hui
+huit
+huitième
+hum
+hurrah
+hé
+hélas
+i
+ici
+il
+ils
+importe
+j
+je
+jusqu
+jusque
+juste
+k
+l
+la
+laisser
+laquelle
+las
+le
+lequel
+les
+lesquelles
+lesquels
+leur
+leurs
+longtemps
+lors
+lorsque
+lui
+lui-meme
+lui-même
+là
+lès
+m
+ma
+maint
+maintenant
+mais
+malgre
+malgré
+maximale
+me
+meme
+memes
+merci
+mes
+mien
+mienne
+miennes
+miens
+mille
+mince
+mine
+minimale
+moi
+moi-meme
+moi-même
+moindres
+moins
+mon
+mot
+moyennant
+multiple
+multiples
+même
+mêmes
+n
+na
+naturel
+naturelle
+naturelles
+ne
+neanmoins
+necessaire
+necessairement
+neuf
+neuvième
+ni
+nombreuses
+nombreux
+nommés
+non
+nos
+notamment
+notre
+nous
+nous-mêmes
+nouveau
+nouveaux
+nul
+néanmoins
+nôtre
+nôtres
+o
+oh
+ohé
+ollé
+olé
+on
+ont
+onze
+onzième
+ore
+ou
+ouf
+ouias
+oust
+ouste
+outre
+ouvert
+ouverte
+ouverts
+o|
+où
+p
+paf
+pan
+par
+parce
+parfois
+parle
+parlent
+parler
+parmi
+parole
+parseme
+partant
+particulier
+particulière
+particulièrement
+pas
+passé
+pendant
+pense
+permet
+personne
+personnes
+peu
+peut
+peuvent
+peux
+pff
+pfft
+pfut
+pif
+pire
+pièce
+plein
+plouf
+plupart
+plus
+plusieurs
+plutôt
+possessif
+possessifs
+possible
+possibles
+pouah
+pour
+pourquoi
+pourrais
+pourrait
+pouvait
+prealable
+precisement
+premier
+première
+premièrement
+pres
+probable
+probante
+procedant
+proche
+près
+psitt
+pu
+puis
+puisque
+pur
+pure
+q
+qu
+quand
+quant
+quant-à-soi
+quanta
+quarante
+quatorze
+quatre
+quatre-vingt
+quatrième
+quatrièmement
+que
+quel
+quelconque
+quelle
+quelles
+quelqu'un
+quelque
+quelques
+quels
+qui
+quiconque
+quinze
+quoi
+quoique
+r
+rare
+rarement
+rares
+relative
+relativement
+remarquable
+rend
+rendre
+restant
+reste
+restent
+restrictif
+retour
+revoici
+revoilà
+rien
+s
+sa
+sacrebleu
+sait
+sans
+sapristi
+sauf
+se
+sein
+seize
+selon
+semblable
+semblaient
+semble
+semblent
+sent
+sept
+septième
+sera
+serai
+seraient
+serais
+serait
+seras
+serez
+seriez
+serions
+serons
+seront
+ses
+seul
+seule
+seulement
+si
+sien
+sienne
+siennes
+siens
+sinon
+six
+sixième
+soi
+soi-même
+soient
+sois
+soit
+soixante
+sommes
+son
+sont
+sous
+souvent
+soyez
+soyons
+specifique
+specifiques
+speculatif
+stop
+strictement
+subtiles
+suffisant
+suffisante
+suffit
+suis
+suit
+suivant
+suivante
+suivantes
+suivants
+suivre
+sujet
+superpose
+sur
+surtout
+t
+ta
+tac
+tandis
+tant
+tardive
+te
+tel
+telle
+tellement
+telles
+tels
+tenant
+tend
+tenir
+tente
+tes
+tic
+tien
+tienne
+tiennes
+tiens
+toc
+toi
+toi-même
+ton
+touchant
+toujours
+tous
+tout
+toute
+toutefois
+toutes
+treize
+trente
+tres
+trois
+troisième
+troisièmement
+trop
+très
+tsoin
+tsouin
+tu
+té
+u
+un
+une
+unes
+uniformement
+unique
+uniques
+uns
+v
+va
+vais
+valeur
+vas
+vers
+via
+vif
+vifs
+vingt
+vivat
+vive
+vives
+vlan
+voici
+voie
+voient
+voilà
+vont
+vos
+votre
+vous
+vous-mêmes
+vu
+vé
+vôtre
+vôtres
+w
+x
+y
+z
+zut
+à
+â
+ça
+ès
+étaient
+étais
+était
+étant
+état
+étiez
+étions
+été
+étée
+étées
+étés
+êtes
+être
+ô
+\ No newline at end of file
diff --git a/static/stopwords/ga b/static/stopwords/ga

new file mode 100644 (file)

index 0000000..5ad466d
--- /dev/null
+++ b/static/stopwords/ga
@@ -0,0 +1,109 @@
+a
+ach
+ag
+agus
+an
+aon
+ar
+arna
+as
+b'
+ba
+beirt
+bhúr
+caoga
+ceathair
+ceathrar
+chomh
+chtó
+chuig
+chun
+cois
+céad
+cúig
+cúigear
+d'
+daichead
+dar
+de
+deich
+deichniúr
+den
+dhá
+do
+don
+dtí
+dá
+dár
+dó
+faoi
+faoin
+faoina
+faoinár
+fara
+fiche
+gach
+gan
+go
+gur
+haon
+hocht
+i
+iad
+idir
+in
+ina
+ins
+inár
+is
+le
+leis
+lena
+lenár
+m'
+mar
+mo
+mé
+na
+nach
+naoi
+naonúr
+ná
+ní
+níor
+nó
+nócha
+ocht
+ochtar
+os
+roimh
+sa
+seacht
+seachtar
+seachtó
+seasca
+seisear
+siad
+sibh
+sinn
+sna
+sé
+sí
+tar
+thar
+thú
+triúr
+trí
+trína
+trínár
+tríocha
+tú
+um
+ár
+é
+éis
+í
+ó
+ón
+óna
+ónár
+\ No newline at end of file
diff --git a/static/stopwords/gl b/static/stopwords/gl

new file mode 100644 (file)

index 0000000..c5baac0
--- /dev/null
+++ b/static/stopwords/gl
@@ -0,0 +1,160 @@
+a
+alí
+ao
+aos
+aquel
+aquela
+aquelas
+aqueles
+aquilo
+aquí
+as
+así
+aínda
+ben
+cando
+che
+co
+coa
+coas
+comigo
+con
+connosco
+contigo
+convosco
+cos
+cun
+cunha
+cunhas
+cuns
+da
+dalgunha
+dalgunhas
+dalgún
+dalgúns
+das
+de
+del
+dela
+delas
+deles
+desde
+deste
+do
+dos
+dun
+dunha
+dunhas
+duns
+e
+el
+ela
+elas
+eles
+en
+era
+eran
+esa
+esas
+ese
+eses
+esta
+estaba
+estar
+este
+estes
+estiven
+estou
+está
+están
+eu
+facer
+foi
+foron
+fun
+había
+hai
+iso
+isto
+la
+las
+lle
+lles
+lo
+los
+mais
+me
+meu
+meus
+min
+miña
+miñas
+moi
+na
+nas
+neste
+nin
+no
+non
+nos
+nosa
+nosas
+noso
+nosos
+nun
+nunha
+nunhas
+nuns
+nós
+o
+os
+ou
+para
+pero
+pode
+pois
+pola
+polas
+polo
+polos
+por
+que
+se
+senón
+ser
+seu
+seus
+sexa
+sido
+sobre
+súa
+súas
+tamén
+tan
+te
+ten
+ter
+teu
+teus
+teñen
+teño
+ti
+tido
+tiven
+tiña
+túa
+túas
+un
+unha
+unhas
+uns
+vos
+vosa
+vosas
+voso
+vosos
+vós
+á
+é
+ó
+ós
+\ No newline at end of file
diff --git a/static/stopwords/ha b/static/stopwords/ha

new file mode 100644 (file)

index 0000000..dce823d
--- /dev/null
+++ b/static/stopwords/ha
@@ -0,0 +1,39 @@
+a
+amma
+ba
+ban
+ce
+cikin
+da
+don
+ga
+in
+ina
+ita
+ji
+ka
+ko
+kuma
+lokacin
+ma
+mai
+na
+ne
+ni
+sai
+shi
+su
+suka
+sun
+ta
+tafi
+take
+tana
+wani
+wannan
+wata
+ya
+yake
+yana
+yi
+za
+\ No newline at end of file
diff --git a/static/stopwords/he b/static/stopwords/he

new file mode 100644 (file)

index 0000000..5f345f3
--- /dev/null
+++ b/static/stopwords/he
@@ -0,0 +1,194 @@
+אבל
+או
+אולי
+אותה
+אותו
+אותי
+אותך
+אותם
+אותן
+אותנו
+אז
+אחר
+אחרות
+אחרי
+אחריכן
+אחרים
+אחרת
+אי
+איזה
+איך
+אין
+איפה
+איתה
+איתו
+איתי
+איתך
+איתכם
+איתכן
+איתם
+איתן
+איתנו
+אך
+אל
+אלה
+אלו
+אם
+אנחנו
+אני
+אס
+אף
+אצל
+אשר
+את
+אתה
+אתכם
+אתכן
+אתם
+אתן
+באיזומידה
+באמצע
+באמצעות
+בגלל
+בין
+בלי
+במידה
+במקוםשבו
+ברם
+בשביל
+בשעהש
+בתוך
+גם
+דרך
+הוא
+היא
+היה
+היכן
+היתה
+היתי
+הם
+הן
+הנה
+הסיבהשבגללה
+הרי
+ואילו
+ואת
+זאת
+זה
+זות
+יהיה
+יוכל
+יוכלו
+יותרמדי
+יכול
+יכולה
+יכולות
+יכולים
+יכל
+יכלה
+יכלו
+יש
+כאן
+כאשר
+כולם
+כולן
+כזה
+כי
+כיצד
+כך
+ככה
+כל
+כלל
+כמו
+כן
+כפי
+כש
+לא
+לאו
+לאיזותכלית
+לאן
+לבין
+לה
+להיות
+להם
+להן
+לו
+לי
+לכם
+לכן
+למה
+למטה
+למעלה
+למקוםשבו
+למרות
+לנו
+לעבר
+לעיכן
+לפיכך
+לפני
+מאד
+מאחורי
+מאיזוסיבה
+מאין
+מאיפה
+מבלי
+מבעד
+מדוע
+מה
+מהיכן
+מול
+מחוץ
+מי
+מכאן
+מכיוון
+מלבד
+מן
+מנין
+מסוגל
+מעט
+מעטים
+מעל
+מצד
+מקוםבו
+מתחת
+מתי
+נגד
+נגר
+נו
+עד
+עז
+על
+עלי
+עליה
+עליהם
+עליהן
+עליו
+עליך
+עליכם
+עלינו
+עם
+עצמה
+עצמהם
+עצמהן
+עצמו
+עצמי
+עצמם
+עצמן
+עצמנו
+פה
+רק
+שוב
+של
+שלה
+שלהם
+שלהן
+שלו
+שלי
+שלך
+שלכה
+שלכם
+שלכן
+שלנו
+שם
+תהיה
+תחת
+\ No newline at end of file
diff --git a/static/stopwords/hi b/static/stopwords/hi

new file mode 100644 (file)

index 0000000..b4b2078
--- /dev/null
+++ b/static/stopwords/hi
@@ -0,0 +1,225 @@
+अंदर
+अत
+अदि
+अप
+अपना
+अपनि
+अपनी
+अपने
+अभि
+अभी
+आदि
+आप
+इंहिं
+इंहें
+इंहों
+इतयादि
+इत्यादि
+इन
+इनका
+इन्हीं
+इन्हें
+इन्हों
+इस
+इसका
+इसकि
+इसकी
+इसके
+इसमें
+इसि
+इसी
+इसे
+उंहिं
+उंहें
+उंहों
+उन
+उनका
+उनकि
+उनकी
+उनके
+उनको
+उन्हीं
+उन्हें
+उन्हों
+उस
+उसके
+उसि
+उसी
+उसे
+एक
+एवं
+एस
+एसे
+ऐसे
+ओर
+और
+कइ
+कई
+कर
+करता
+करते
+करना
+करने
+करें
+कहते
+कहा
+का
+काफि
+काफ़ी
+कि
+किंहें
+किंहों
+कितना
+किन्हें
+किन्हों
+किया
+किर
+किस
+किसि
+किसी
+किसे
+की
+कुछ
+कुल
+के
+को
+कोइ
+कोई
+कोन
+कोनसा
+कौन
+कौनसा
+गया
+घर
+जब
+जहाँ
+जहां
+जा
+जिंहें
+जिंहों
+जितना
+जिधर
+जिन
+जिन्हें
+जिन्हों
+जिस
+जिसे
+जीधर
+जेसा
+जेसे
+जैसा
+जैसे
+जो
+तक
+तब
+तरह
+तिंहें
+तिंहों
+तिन
+तिन्हें
+तिन्हों
+तिस
+तिसे
+तो
+था
+थि
+थी
+थे
+दबारा
+दवारा
+दिया
+दुसरा
+दुसरे
+दूसरे
+दो
+द्वारा
+न
+नहिं
+नहीं
+ना
+निचे
+निहायत
+नीचे
+ने
+पर
+पहले
+पुरा
+पूरा
+पे
+फिर
+बनि
+बनी
+बहि
+बही
+बहुत
+बाद
+बाला
+बिलकुल
+भि
+भितर
+भी
+भीतर
+मगर
+मानो
+मे
+में
+यदि
+यह
+यहाँ
+यहां
+यहि
+यही
+या
+यिह
+ये
+रखें
+रवासा
+रहा
+रहे
+ऱ्वासा
+लिए
+लिये
+लेकिन
+व
+वगेरह
+वरग
+वर्ग
+वह
+वहाँ
+वहां
+वहिं
+वहीं
+वाले
+वुह
+वे
+वग़ैरह
+संग
+सकता
+सकते
+सबसे
+सभि
+सभी
+साथ
+साबुत
+साभ
+सारा
+से
+सो
+हि
+ही
+हुअ
+हुआ
+हुइ
+हुई
+हुए
+हे
+हें
+है
+हैं
+हो
+होता
+होति
+होती
+होते
+होना
+होने
+\ No newline at end of file
diff --git a/static/stopwords/hr b/static/stopwords/hr

new file mode 100644 (file)

index 0000000..64388b0
--- /dev/null
+++ b/static/stopwords/hr
@@ -0,0 +1,179 @@
+a
+ako
+ali
+bi
+bih
+bila
+bili
+bilo
+bio
+bismo
+biste
+biti
+bumo
+da
+do
+duž
+ga
+hoće
+hoćemo
+hoćete
+hoćeš
+hoću
+i
+iako
+ih
+ili
+iz
+ja
+je
+jedna
+jedne
+jedno
+jer
+jesam
+jesi
+jesmo
+jest
+jeste
+jesu
+jim
+joj
+još
+ju
+kada
+kako
+kao
+koja
+koje
+koji
+kojima
+koju
+kroz
+li
+me
+mene
+meni
+mi
+mimo
+moj
+moja
+moje
+mu
+na
+nad
+nakon
+nam
+nama
+nas
+naš
+naša
+naše
+našeg
+ne
+nego
+neka
+neki
+nekog
+neku
+nema
+netko
+neće
+nećemo
+nećete
+nećeš
+neću
+nešto
+ni
+nije
+nikoga
+nikoje
+nikoju
+nisam
+nisi
+nismo
+niste
+nisu
+njega
+njegov
+njegova
+njegovo
+njemu
+njezin
+njezina
+njezino
+njih
+njihov
+njihova
+njihovo
+njim
+njima
+njoj
+nju
+no
+o
+od
+odmah
+on
+ona
+oni
+ono
+ova
+pa
+pak
+po
+pod
+pored
+prije
+s
+sa
+sam
+samo
+se
+sebe
+sebi
+si
+smo
+ste
+su
+sve
+svi
+svog
+svoj
+svoja
+svoje
+svom
+ta
+tada
+taj
+tako
+te
+tebe
+tebi
+ti
+to
+toj
+tome
+tu
+tvoj
+tvoja
+tvoje
+u
+uz
+vam
+vama
+vas
+vaš
+vaša
+vaše
+već
+vi
+vrlo
+za
+zar
+će
+ćemo
+ćete
+ćeš
+ću
+što
+\ No newline at end of file
diff --git a/static/stopwords/hu b/static/stopwords/hu

new file mode 100644 (file)

index 0000000..3d92c94
--- /dev/null
+++ b/static/stopwords/hu
@@ -0,0 +1,1185 @@
+a
+abba
+abban
+abbã³l
+abból
+addig
+ahhoz
+ahogy
+ahol
+aki
+akik
+akkor
+akár
+akã¡r
+alapján
+alapjã¡n
+alatt
+alatta
+alattad
+alattam
+alattatok
+alattuk
+alattunk
+alá
+alád
+alájuk
+alám
+alánk
+alátok
+alã¡
+alã¡d
+alã¡juk
+alã¡m
+alã¡nk
+alã¡tok
+alã³l
+alã³la
+alã³lad
+alã³lam
+alã³latok
+alã³luk
+alã³lunk
+alól
+alóla
+alólad
+alólam
+alólatok
+alóluk
+alólunk
+amely
+amelybol
+amelyek
+amelyekben
+amelyeket
+amelyet
+amelyik
+amelynek
+ami
+amikor
+amit
+amolyan
+amott
+amãg
+amíg
+annak
+annál
+annã¡l
+arra
+arrã³l
+arról
+attã³l
+attól
+az
+aznap
+azok
+azokat
+azokba
+azokban
+azokbã³l
+azokból
+azokhoz
+azokig
+azokkal
+azokká
+azokkã¡
+azoknak
+azoknál
+azoknã¡l
+azokon
+azokra
+azokrã³l
+azokról
+azoktã³l
+azoktól
+azokã©rt
+azokért
+azon
+azonban
+azonnal
+azt
+aztán
+aztã¡n
+azután
+azzal
+azzá
+azzã¡
+azã©rt
+azért
+bal
+balra
+ban
+be
+belã©
+belã©d
+belã©jã¼k
+belã©m
+belã©nk
+belã©tek
+belã¼l
+belå‘le
+belå‘led
+belå‘lem
+belå‘letek
+belå‘lã¼k
+belå‘lã¼nk
+belé
+beléd
+beléjük
+belém
+belénk
+belétek
+belül
+belőle
+belőled
+belőlem
+belőletek
+belőlük
+belőlünk
+ben
+benne
+benned
+bennem
+bennetek
+bennã¼k
+bennã¼nk
+bennük
+bennünk
+bár
+bárcsak
+bármilyen
+bã¡r
+bã¡rcsak
+bã¡rmilyen
+bãºcsãº
+búcsú
+cikk
+cikkek
+cikkeket
+csak
+csakhogy
+csupán
+csupã¡n
+de
+dehogy
+e
+ebbe
+ebben
+ebbå‘l
+ebből
+eddig
+egy
+egyebek
+egyebet
+egyedã¼l
+egyedül
+egyelå‘re
+egyelőre
+egyes
+egyet
+egyetlen
+egyik
+egymás
+egymã¡s
+egyre
+egyszerre
+egyã©b
+egyã¼tt
+egyéb
+együtt
+egã©sz
+egã©szen
+egész
+egészen
+ehhez
+ekkor
+el
+eleinte
+ellen
+ellenes
+elleni
+ellenã©re
+ellenére
+elmondta
+elså‘
+elså‘k
+elså‘sorban
+elså‘t
+elsõ
+első
+elsők
+elsősorban
+elsőt
+elã©
+elã©d
+elã©g
+elã©jã¼k
+elã©m
+elã©nk
+elã©tek
+elå‘bb
+elå‘l
+elå‘le
+elå‘led
+elå‘lem
+elå‘letek
+elå‘lã¼k
+elå‘lã¼nk
+elå‘szã¶r
+elå‘tt
+elå‘tte
+elå‘tted
+elå‘ttem
+elå‘ttetek
+elå‘ttã¼k
+elå‘ttã¼nk
+elå‘zå‘
+elé
+eléd
+elég
+eléjük
+elém
+elénk
+elétek
+elõ
+elõször
+elõtt
+elő
+előbb
+elől
+előle
+előled
+előlem
+előletek
+előlük
+előlünk
+először
+előtt
+előtte
+előtted
+előttem
+előttetek
+előttük
+előttünk
+előző
+emilyen
+engem
+ennek
+ennyi
+ennã©l
+ennél
+enyã©m
+enyém
+erre
+errå‘l
+erről
+esetben
+ettå‘l
+ettől
+ez
+ezek
+ezekbe
+ezekben
+ezekbå‘l
+ezekből
+ezeken
+ezeket
+ezekhez
+ezekig
+ezekkel
+ezekkã©
+ezekké
+ezeknek
+ezeknã©l
+ezeknél
+ezekre
+ezekrå‘l
+ezekről
+ezektå‘l
+ezektől
+ezekã©rt
+ezekért
+ezen
+ezentãºl
+ezentúl
+ezer
+ezret
+ezt
+ezután
+ezutã¡n
+ezzel
+ezzã©
+ezzé
+ezã©rt
+ezért
+fel
+fele
+felek
+felet
+felett
+felã©
+felé
+fent
+fenti
+fã©l
+fã¶lã©
+fél
+fölé
+gyakran
+ha
+hallã³
+halló
+hamar
+hanem
+harmadik
+harmadikat
+harminc
+hat
+hatodik
+hatodikat
+hatot
+hatvan
+helyett
+hetedik
+hetediket
+hetet
+hetven
+hirtelen
+hiszen
+hiába
+hiã¡ba
+hogy
+hogyan
+hol
+holnap
+holnapot
+honnan
+hova
+hozzá
+hozzád
+hozzájuk
+hozzám
+hozzánk
+hozzátok
+hozzã¡
+hozzã¡d
+hozzã¡juk
+hozzã¡m
+hozzã¡nk
+hozzã¡tok
+hurrá
+hurrã¡
+huszadik
+hány
+hányszor
+hármat
+három
+hát
+hátha
+hátulsó
+hã¡ny
+hã¡nyszor
+hã¡rmat
+hã¡rom
+hã¡t
+hã¡tha
+hã¡tulsã³
+hã©t
+hãºsz
+hét
+húsz
+ide
+ide-ð¾da
+ide-оda
+idã©n
+idén
+igazán
+igazã¡n
+igen
+ill
+ill.
+illetve
+ilyen
+ilyenkor
+immár
+immã¡r
+inkább
+inkã¡bb
+is
+ismã©t
+ismét
+ison
+itt
+jelenleg
+jobban
+jobbra
+jã³
+jã³l
+jã³lesik
+jã³val
+jã¶vå‘re
+jó
+jól
+jólesik
+jóval
+jövőre
+kell
+kellene
+kellett
+kelljen
+keressünk
+keresztül
+ketten
+kettå‘
+kettå‘t
+kettő
+kettőt
+kevã©s
+kevés
+ki
+kiben
+kibå‘l
+kiből
+kicsit
+kicsoda
+kihez
+kik
+kikbe
+kikben
+kikbå‘l
+kikből
+kiken
+kiket
+kikhez
+kikkel
+kikkã©
+kikké
+kiknek
+kiknã©l
+kiknél
+kikre
+kikrå‘l
+kikről
+kiktå‘l
+kiktől
+kikã©rt
+kikért
+kilenc
+kilencedik
+kilencediket
+kilencet
+kilencven
+kin
+kinek
+kinã©l
+kinél
+kire
+kirå‘l
+kiről
+kit
+kitå‘l
+kitől
+kivel
+kivã©
+kivé
+kiã©
+kiã©rt
+kié
+kiért
+korábban
+korã¡bban
+kã©pest
+kã©rem
+kã©rlek
+kã©sz
+kã©så‘
+kã©så‘bb
+kã©så‘n
+kã©t
+kã©tszer
+kã¶rã¼l
+kã¶szã¶nhetå‘en
+kã¶szã¶nã¶m
+kã¶zben
+kã¶zel
+kã¶zepesen
+kã¶zepã©n
+kã¶zã©
+kã¶zã¶tt
+kã¶zã¼l
+kã¼lã¶n
+kã¼lã¶nben
+kã¼lã¶nbã¶zå‘
+kã¼lã¶nbã¶zå‘bb
+kã¼lã¶nbã¶zå‘ek
+képest
+kérem
+kérlek
+kész
+késő
+később
+későn
+két
+kétszer
+kívül
+körül
+köszönhetően
+köszönöm
+közben
+közel
+közepesen
+közepén
+közé
+között
+közül
+külön
+különben
+különböző
+különbözőbb
+különbözőek
+lassan
+le
+legalább
+legalã¡bb
+legyen
+lehet
+lehetetlen
+lehetett
+lehetå‘leg
+lehetå‘sã©g
+lehetőleg
+lehetőség
+lenne
+lenni
+lennã©k
+lennã©nek
+lennék
+lennének
+lesz
+leszek
+lesznek
+leszã¼nk
+leszünk
+lett
+lettek
+lettem
+lettã¼nk
+lettünk
+lã©vå‘
+lévő
+ma
+maga
+magad
+magam
+magatokat
+magukat
+magunkat
+magát
+magã¡t
+mai
+majd
+majdnem
+manapság
+manapsã¡g
+meg
+megcsinál
+megcsinálnak
+megcsinã¡l
+megcsinã¡lnak
+megint
+megvan
+mellett
+mellette
+melletted
+mellettem
+mellettetek
+mellettã¼k
+mellettã¼nk
+mellettük
+mellettünk
+mellã©
+mellã©d
+mellã©jã¼k
+mellã©m
+mellã©nk
+mellã©tek
+mellå‘l
+mellå‘le
+mellå‘led
+mellå‘lem
+mellå‘letek
+mellå‘lã¼k
+mellå‘lã¼nk
+mellé
+melléd
+melléjük
+mellém
+mellénk
+mellétek
+mellől
+mellőle
+mellőled
+mellőlem
+mellőletek
+mellőlük
+mellőlünk
+mely
+melyek
+melyik
+mennyi
+mert
+mi
+miatt
+miatta
+miattad
+miattam
+miattatok
+miattuk
+miattunk
+mibe
+miben
+mibå‘l
+miből
+mihez
+mik
+mikbe
+mikben
+mikbå‘l
+mikből
+miken
+miket
+mikhez
+mikkel
+mikkã©
+mikké
+miknek
+miknã©l
+miknél
+mikor
+mikre
+mikrå‘l
+mikről
+miktå‘l
+miktől
+mikã©rt
+mikért
+milyen
+min
+mind
+mindegyik
+mindegyiket
+minden
+mindenesetre
+mindenki
+mindent
+mindenã¼tt
+mindenütt
+mindig
+mindketten
+minek
+minket
+mint
+mintha
+minã©l
+minél
+mire
+mirå‘l
+miről
+mit
+mitå‘l
+mitől
+mivel
+mivã©
+mivé
+miã©rt
+miért
+mondta
+most
+mostanáig
+mostanã¡ig
+már
+más
+másik
+másikat
+másnap
+második
+másodszor
+mások
+másokat
+mást
+mã¡r
+mã¡s
+mã¡sik
+mã¡sikat
+mã¡snap
+mã¡sodik
+mã¡sodszor
+mã¡sok
+mã¡sokat
+mã¡st
+mã©g
+mã©gis
+mãg
+mã¶gã©
+mã¶gã©d
+mã¶gã©jã¼k
+mã¶gã©m
+mã¶gã©nk
+mã¶gã©tek
+mã¶gã¶tt
+mã¶gã¶tte
+mã¶gã¶tted
+mã¶gã¶ttem
+mã¶gã¶ttetek
+mã¶gã¶ttã¼k
+mã¶gã¶ttã¼nk
+mã¶gã¼l
+mã¶gã¼le
+mã¶gã¼led
+mã¶gã¼lem
+mã¶gã¼letek
+mã¶gã¼lã¼k
+mã¶gã¼lã¼nk
+mãºltkor
+mãºlva
+még
+mégis
+míg
+mögé
+mögéd
+mögéjük
+mögém
+mögénk
+mögétek
+mögött
+mögötte
+mögötted
+mögöttem
+mögöttetek
+mögöttük
+mögöttünk
+mögül
+mögüle
+mögüled
+mögülem
+mögületek
+mögülük
+mögülünk
+múltkor
+múlva
+na
+nagy
+nagyobb
+nagyon
+naponta
+napot
+ne
+negyedik
+negyediket
+negyven
+neked
+nekem
+neki
+nekik
+nektek
+nekã¼nk
+nekünk
+nem
+nemcsak
+nemrã©g
+nemrég
+nincs
+nyolc
+nyolcadik
+nyolcadikat
+nyolcat
+nyolcvan
+nála
+nálad
+nálam
+nálatok
+náluk
+nálunk
+nã¡la
+nã¡lad
+nã¡lam
+nã¡latok
+nã¡luk
+nã¡lunk
+nã©gy
+nã©gyet
+nã©ha
+nã©hã¡ny
+nã©lkã¼l
+négy
+négyet
+néha
+néhány
+nélkül
+o
+oda
+ok
+olyan
+onnan
+ott
+pedig
+persze
+pár
+pã¡r
+pã©ldã¡ul
+például
+rajta
+rajtad
+rajtam
+rajtatok
+rajtuk
+rajtunk
+rendben
+rosszul
+rá
+rád
+rájuk
+rám
+ránk
+rátok
+rã¡
+rã¡d
+rã¡juk
+rã¡m
+rã¡nk
+rã¡tok
+rã©gen
+rã©gã³ta
+rã©szã©re
+rã³la
+rã³lad
+rã³lam
+rã³latok
+rã³luk
+rã³lunk
+rã¶gtã¶n
+régen
+régóta
+részére
+róla
+rólad
+rólam
+rólatok
+róluk
+rólunk
+rögtön
+s
+saját
+se
+sem
+semmi
+semmilyen
+semmisã©g
+semmiség
+senki
+soha
+sok
+sokan
+sokat
+sokkal
+sokszor
+sokáig
+sokã¡ig
+során
+sorã¡n
+stb.
+szemben
+szerbusz
+szerint
+szerinte
+szerinted
+szerintem
+szerintetek
+szerintã¼k
+szerintã¼nk
+szerintük
+szerintünk
+szervusz
+szinte
+számára
+száz
+századik
+százat
+szã¡mã¡ra
+szã¡z
+szã¡zadik
+szã¡zat
+szã©pen
+szãves
+szãvesen
+szãveskedjã©k
+szépen
+szét
+szíves
+szívesen
+szíveskedjék
+så‘t
+sőt
+talán
+talã¡n
+tavaly
+te
+tegnap
+tegnapelå‘tt
+tegnapelőtt
+tehát
+tehã¡t
+tele
+teljes
+tessã©k
+tessék
+ti
+tied
+titeket
+tizedik
+tizediket
+tizenegy
+tizenegyedik
+tizenhat
+tizenhárom
+tizenhã¡rom
+tizenhã©t
+tizenhét
+tizenkettedik
+tizenkettå‘
+tizenkettő
+tizenkilenc
+tizenkã©t
+tizenkét
+tizennyolc
+tizennã©gy
+tizennégy
+tizenã¶t
+tizenöt
+tizet
+tovább
+további
+továbbá
+tovã¡bb
+tovã¡bbi
+távol
+tã¡vol
+tã©ged
+tã©nyleg
+tãz
+tã¶bb
+tã¶bbi
+tã¶bbszã¶r
+tãºl
+tå‘le
+tå‘led
+tå‘lem
+tå‘letek
+tå‘lã¼k
+tå‘lã¼nk
+téged
+tényleg
+tíz
+több
+többi
+többször
+túl
+tőle
+tőled
+tőlem
+tőletek
+tőlük
+tőlünk
+ugyanakkor
+ugyanez
+ugyanis
+ugye
+urak
+uram
+urat
+utoljára
+utoljã¡ra
+utolsã³
+utolsó
+után
+utána
+utã¡n
+vagy
+vagyis
+vagyok
+vagytok
+vagyunk
+vajon
+valahol
+valaki
+valakit
+valamelyik
+valami
+valamint
+való
+van
+vannak
+vele
+veled
+velem
+veletek
+velã¼k
+velã¼nk
+velük
+velünk
+vissza
+viszlát
+viszlã¡t
+viszont
+viszontlátásra
+viszontlã¡tã¡sra
+volna
+volnának
+volnã¡nak
+volnã©k
+volnék
+volt
+voltak
+voltam
+voltunk
+vã©gre
+vã©gã©n
+vã©gã¼l
+végre
+végén
+végül
+által
+általában
+ám
+át
+ã¡ltal
+ã¡ltalã¡ban
+ã¡m
+ã¡t
+ã©ljen
+ã©n
+ã©rte
+ã©rted
+ã©rtem
+ã©rtetek
+ã©rtã¼k
+ã©rtã¼nk
+ã©s
+ã©v
+ã©vben
+ã©ve
+ã©vek
+ã©ves
+ã©vi
+ã©vvel
+ãgy
+ã³ta
+ã¶n
+ã¶nbe
+ã¶nben
+ã¶nbå‘l
+ã¶nhã¶z
+ã¶nnek
+ã¶nnel
+ã¶nnã©l
+ã¶nre
+ã¶nrå‘l
+ã¶nt
+ã¶ntå‘l
+ã¶nã©rt
+ã¶nã¶k
+ã¶nã¶kbe
+ã¶nã¶kben
+ã¶nã¶kbå‘l
+ã¶nã¶ket
+ã¶nã¶khã¶z
+ã¶nã¶kkel
+ã¶nã¶knek
+ã¶nã¶knã©l
+ã¶nã¶kre
+ã¶nã¶krå‘l
+ã¶nã¶ktå‘l
+ã¶nã¶kã©rt
+ã¶nã¶kã¶n
+ã¶nã¶n
+ã¶t
+ã¶tven
+ã¶tã¶dik
+ã¶tã¶diket
+ã¶tã¶t
+ãºgy
+ãºgyis
+ãºgynevezett
+ãºjra
+ãºr
+å‘
+å‘k
+å‘ket
+å‘t
+éljen
+én
+éppen
+érte
+érted
+értem
+értetek
+értük
+értünk
+és
+év
+évben
+éve
+évek
+éves
+évi
+évvel
+így
+óta
+õ
+õk
+õket
+ön
+önbe
+önben
+önből
+önhöz
+önnek
+önnel
+önnél
+önre
+önről
+önt
+öntől
+önért
+önök
+önökbe
+önökben
+önökből
+önöket
+önökhöz
+önökkel
+önöknek
+önöknél
+önökre
+önökről
+önöktől
+önökért
+önökön
+önön
+össze
+öt
+ötven
+ötödik
+ötödiket
+ötöt
+úgy
+úgyis
+úgynevezett
+új
+újabb
+újra
+úr
+ő
+ők
+őket
+őt
+\ No newline at end of file
diff --git a/static/stopwords/hy b/static/stopwords/hy

new file mode 100644 (file)

index 0000000..327af43
--- /dev/null
+++ b/static/stopwords/hy
@@ -0,0 +1,45 @@
+այդ
+այլ
+այն
+այս
+դու
+դուք
+եմ
+են
+ենք
+ես
+եք
+է
+էի
+էին
+էինք
+էիր
+էիք
+էր
+ըստ
+թ
+ի
+ին
+իսկ
+իր
+կամ
+համար
+հետ
+հետո
+մենք
+մեջ
+մի
+ն
+նա
+նաև
+նրա
+նրանք
+որ
+որը
+որոնք
+որպես
+ու
+ում
+պիտի
+վրա
+և
+\ No newline at end of file
diff --git a/static/stopwords/id b/static/stopwords/id

new file mode 100644 (file)

index 0000000..28b6fe3
--- /dev/null
+++ b/static/stopwords/id
@@ -0,0 +1,758 @@
+ada
+adalah
+adanya
+adapun
+agak
+agaknya
+agar
+akan
+akankah
+akhir
+akhiri
+akhirnya
+aku
+akulah
+amat
+amatlah
+anda
+andalah
+antar
+antara
+antaranya
+apa
+apaan
+apabila
+apakah
+apalagi
+apatah
+artinya
+asal
+asalkan
+atas
+atau
+ataukah
+ataupun
+awal
+awalnya
+bagai
+bagaikan
+bagaimana
+bagaimanakah
+bagaimanapun
+bagi
+bagian
+bahkan
+bahwa
+bahwasanya
+baik
+bakal
+bakalan
+balik
+banyak
+bapak
+baru
+bawah
+beberapa
+begini
+beginian
+beginikah
+beginilah
+begitu
+begitukah
+begitulah
+begitupun
+bekerja
+belakang
+belakangan
+belum
+belumlah
+benar
+benarkah
+benarlah
+berada
+berakhir
+berakhirlah
+berakhirnya
+berapa
+berapakah
+berapalah
+berapapun
+berarti
+berawal
+berbagai
+berdatangan
+beri
+berikan
+berikut
+berikutnya
+berjumlah
+berkali-kali
+berkata
+berkehendak
+berkeinginan
+berkenaan
+berlainan
+berlalu
+berlangsung
+berlebihan
+bermacam
+bermacam-macam
+bermaksud
+bermula
+bersama
+bersama-sama
+bersiap
+bersiap-siap
+bertanya
+bertanya-tanya
+berturut
+berturut-turut
+bertutur
+berujar
+berupa
+besar
+betul
+betulkah
+biasa
+biasanya
+bila
+bilakah
+bisa
+bisakah
+boleh
+bolehkah
+bolehlah
+buat
+bukan
+bukankah
+bukanlah
+bukannya
+bulan
+bung
+cara
+caranya
+cukup
+cukupkah
+cukuplah
+cuma
+dahulu
+dalam
+dan
+dapat
+dari
+daripada
+datang
+dekat
+demi
+demikian
+demikianlah
+dengan
+depan
+di
+dia
+diakhiri
+diakhirinya
+dialah
+diantara
+diantaranya
+diberi
+diberikan
+diberikannya
+dibuat
+dibuatnya
+didapat
+didatangkan
+digunakan
+diibaratkan
+diibaratkannya
+diingat
+diingatkan
+diinginkan
+dijawab
+dijelaskan
+dijelaskannya
+dikarenakan
+dikatakan
+dikatakannya
+dikerjakan
+diketahui
+diketahuinya
+dikira
+dilakukan
+dilalui
+dilihat
+dimaksud
+dimaksudkan
+dimaksudkannya
+dimaksudnya
+diminta
+dimintai
+dimisalkan
+dimulai
+dimulailah
+dimulainya
+dimungkinkan
+dini
+dipastikan
+diperbuat
+diperbuatnya
+dipergunakan
+diperkirakan
+diperlihatkan
+diperlukan
+diperlukannya
+dipersoalkan
+dipertanyakan
+dipunyai
+diri
+dirinya
+disampaikan
+disebut
+disebutkan
+disebutkannya
+disini
+disinilah
+ditambahkan
+ditandaskan
+ditanya
+ditanyai
+ditanyakan
+ditegaskan
+ditujukan
+ditunjuk
+ditunjuki
+ditunjukkan
+ditunjukkannya
+ditunjuknya
+dituturkan
+dituturkannya
+diucapkan
+diucapkannya
+diungkapkan
+dong
+dua
+dulu
+empat
+enggak
+enggaknya
+entah
+entahlah
+guna
+gunakan
+hal
+hampir
+hanya
+hanyalah
+hari
+harus
+haruslah
+harusnya
+hendak
+hendaklah
+hendaknya
+hingga
+ia
+ialah
+ibarat
+ibaratkan
+ibaratnya
+ibu
+ikut
+ingat
+ingat-ingat
+ingin
+inginkah
+inginkan
+ini
+inikah
+inilah
+itu
+itukah
+itulah
+jadi
+jadilah
+jadinya
+jangan
+jangankan
+janganlah
+jauh
+jawab
+jawaban
+jawabnya
+jelas
+jelaskan
+jelaslah
+jelasnya
+jika
+jikalau
+juga
+jumlah
+jumlahnya
+justru
+kala
+kalau
+kalaulah
+kalaupun
+kalian
+kami
+kamilah
+kamu
+kamulah
+kan
+kapan
+kapankah
+kapanpun
+karena
+karenanya
+kasus
+kata
+katakan
+katakanlah
+katanya
+ke
+keadaan
+kebetulan
+kecil
+kedua
+keduanya
+keinginan
+kelamaan
+kelihatan
+kelihatannya
+kelima
+keluar
+kembali
+kemudian
+kemungkinan
+kemungkinannya
+kenapa
+kepada
+kepadanya
+kesampaian
+keseluruhan
+keseluruhannya
+keterlaluan
+ketika
+khususnya
+kini
+kinilah
+kira
+kira-kira
+kiranya
+kita
+kitalah
+kok
+kurang
+lagi
+lagian
+lah
+lain
+lainnya
+lalu
+lama
+lamanya
+lanjut
+lanjutnya
+lebih
+lewat
+lima
+luar
+macam
+maka
+makanya
+makin
+malah
+malahan
+mampu
+mampukah
+mana
+manakala
+manalagi
+masa
+masalah
+masalahnya
+masih
+masihkah
+masing
+masing-masing
+mau
+maupun
+melainkan
+melakukan
+melalui
+melihat
+melihatnya
+memang
+memastikan
+memberi
+memberikan
+membuat
+memerlukan
+memihak
+meminta
+memintakan
+memisalkan
+memperbuat
+mempergunakan
+memperkirakan
+memperlihatkan
+mempersiapkan
+mempersoalkan
+mempertanyakan
+mempunyai
+memulai
+memungkinkan
+menaiki
+menambahkan
+menandaskan
+menanti
+menanti-nanti
+menantikan
+menanya
+menanyai
+menanyakan
+mendapat
+mendapatkan
+mendatang
+mendatangi
+mendatangkan
+menegaskan
+mengakhiri
+mengapa
+mengatakan
+mengatakannya
+mengenai
+mengerjakan
+mengetahui
+menggunakan
+menghendaki
+mengibaratkan
+mengibaratkannya
+mengingat
+mengingatkan
+menginginkan
+mengira
+mengucapkan
+mengucapkannya
+mengungkapkan
+menjadi
+menjawab
+menjelaskan
+menuju
+menunjuk
+menunjuki
+menunjukkan
+menunjuknya
+menurut
+menuturkan
+menyampaikan
+menyangkut
+menyatakan
+menyebutkan
+menyeluruh
+menyiapkan
+merasa
+mereka
+merekalah
+merupakan
+meski
+meskipun
+meyakini
+meyakinkan
+minta
+mirip
+misal
+misalkan
+misalnya
+mula
+mulai
+mulailah
+mulanya
+mungkin
+mungkinkah
+nah
+naik
+namun
+nanti
+nantinya
+nyaris
+nyatanya
+oleh
+olehnya
+pada
+padahal
+padanya
+pak
+paling
+panjang
+pantas
+para
+pasti
+pastilah
+penting
+pentingnya
+per
+percuma
+perlu
+perlukah
+perlunya
+pernah
+persoalan
+pertama
+pertama-tama
+pertanyaan
+pertanyakan
+pihak
+pihaknya
+pukul
+pula
+pun
+punya
+rasa
+rasanya
+rata
+rupanya
+saat
+saatnya
+saja
+sajalah
+saling
+sama
+sama-sama
+sambil
+sampai
+sampai-sampai
+sampaikan
+sana
+sangat
+sangatlah
+satu
+saya
+sayalah
+se
+sebab
+sebabnya
+sebagai
+sebagaimana
+sebagainya
+sebagian
+sebaik
+sebaik-baiknya
+sebaiknya
+sebaliknya
+sebanyak
+sebegini
+sebegitu
+sebelum
+sebelumnya
+sebenarnya
+seberapa
+sebesar
+sebetulnya
+sebisanya
+sebuah
+sebut
+sebutlah
+sebutnya
+secara
+secukupnya
+sedang
+sedangkan
+sedemikian
+sedikit
+sedikitnya
+seenaknya
+segala
+segalanya
+segera
+seharusnya
+sehingga
+seingat
+sejak
+sejauh
+sejenak
+sejumlah
+sekadar
+sekadarnya
+sekali
+sekali-kali
+sekalian
+sekaligus
+sekalipun
+sekarang
+sekecil
+seketika
+sekiranya
+sekitar
+sekitarnya
+sekurang-kurangnya
+sekurangnya
+sela
+selagi
+selain
+selaku
+selalu
+selama
+selama-lamanya
+selamanya
+selanjutnya
+seluruh
+seluruhnya
+semacam
+semakin
+semampu
+semampunya
+semasa
+semasih
+semata
+semata-mata
+semaunya
+sementara
+semisal
+semisalnya
+sempat
+semua
+semuanya
+semula
+sendiri
+sendirian
+sendirinya
+seolah
+seolah-olah
+seorang
+sepanjang
+sepantasnya
+sepantasnyalah
+seperlunya
+seperti
+sepertinya
+sepihak
+sering
+seringnya
+serta
+serupa
+sesaat
+sesama
+sesampai
+sesegera
+sesekali
+seseorang
+sesuatu
+sesuatunya
+sesudah
+sesudahnya
+setelah
+setempat
+setengah
+seterusnya
+setiap
+setiba
+setibanya
+setidak-tidaknya
+setidaknya
+setinggi
+seusai
+sewaktu
+siap
+siapa
+siapakah
+siapapun
+sini
+sinilah
+soal
+soalnya
+suatu
+sudah
+sudahkah
+sudahlah
+supaya
+tadi
+tadinya
+tahu
+tahun
+tak
+tambah
+tambahnya
+tampak
+tampaknya
+tandas
+tandasnya
+tanpa
+tanya
+tanyakan
+tanyanya
+tapi
+tegas
+tegasnya
+telah
+tempat
+tengah
+tentang
+tentu
+tentulah
+tentunya
+tepat
+terakhir
+terasa
+terbanyak
+terdahulu
+terdapat
+terdiri
+terhadap
+terhadapnya
+teringat
+teringat-ingat
+terjadi
+terjadilah
+terjadinya
+terkira
+terlalu
+terlebih
+terlihat
+termasuk
+ternyata
+tersampaikan
+tersebut
+tersebutlah
+tertentu
+tertuju
+terus
+terutama
+tetap
+tetapi
+tiap
+tiba
+tiba-tiba
+tidak
+tidakkah
+tidaklah
+tiga
+tinggi
+toh
+tunjuk
+turut
+tutur
+tuturnya
+ucap
+ucapnya
+ujar
+ujarnya
+umum
+umumnya
+ungkap
+ungkapnya
+untuk
+usah
+usai
+waduh
+wah
+wahai
+waktu
+waktunya
+walau
+walaupun
+wong
+yaitu
+yakin
+yakni
+yang
+\ No newline at end of file
diff --git a/static/stopwords/it b/static/stopwords/it

new file mode 100644 (file)

index 0000000..2003b42
--- /dev/null
+++ b/static/stopwords/it
@@ -0,0 +1,660 @@
+a
+abbastanza
+abbia
+abbiamo
+abbiano
+abbiate
+accidenti
+ad
+adesso
+affinche
+agl
+agli
+ahime
+ahimã¨
+ahimè
+ai
+al
+alcuna
+alcuni
+alcuno
+all
+alla
+alle
+allo
+allora
+altre
+altri
+altrimenti
+altro
+altrove
+altrui
+anche
+ancora
+anni
+anno
+ansa
+anticipo
+assai
+attesa
+attraverso
+avanti
+avemmo
+avendo
+avente
+aver
+avere
+averlo
+avesse
+avessero
+avessi
+avessimo
+aveste
+avesti
+avete
+aveva
+avevamo
+avevano
+avevate
+avevi
+avevo
+avrai
+avranno
+avrebbe
+avrebbero
+avrei
+avremmo
+avremo
+avreste
+avresti
+avrete
+avrà
+avrò
+avuta
+avute
+avuti
+avuto
+basta
+ben
+bene
+benissimo
+berlusconi
+brava
+bravo
+buono
+c
+casa
+caso
+cento
+certa
+certe
+certi
+certo
+che
+chi
+chicchessia
+chiunque
+ci
+ciascuna
+ciascuno
+cima
+cinque
+cio
+cioe
+cioã¨
+cioè
+circa
+citta
+città
+cittã
+ciã²
+ciò
+co
+codesta
+codesti
+codesto
+cogli
+coi
+col
+colei
+coll
+coloro
+colui
+come
+cominci
+comprare
+comunque
+con
+concernente
+conciliarsi
+conclusione
+consecutivi
+consecutivo
+consiglio
+contro
+cortesia
+cos
+cosa
+cosi
+cosã¬
+così
+cui
+d
+da
+dagl
+dagli
+dai
+dal
+dall
+dalla
+dalle
+dallo
+dappertutto
+davanti
+degl
+degli
+dei
+del
+dell
+della
+delle
+dello
+dentro
+detto
+deve
+devo
+di
+dice
+dietro
+dire
+dirimpetto
+diventa
+diventare
+diventato
+dopo
+doppio
+dov
+dove
+dovra
+dovrà
+dovrã
+dovunque
+due
+dunque
+durante
+e
+ebbe
+ebbero
+ebbi
+ecc
+ecco
+ed
+effettivamente
+egli
+ella
+entrambi
+eppure
+era
+erano
+eravamo
+eravate
+eri
+ero
+esempio
+esse
+essendo
+esser
+essere
+essi
+ex
+fa
+faccia
+facciamo
+facciano
+facciate
+faccio
+facemmo
+facendo
+facesse
+facessero
+facessi
+facessimo
+faceste
+facesti
+faceva
+facevamo
+facevano
+facevate
+facevi
+facevo
+fai
+fanno
+farai
+faranno
+fare
+farebbe
+farebbero
+farei
+faremmo
+faremo
+fareste
+faresti
+farete
+farà
+farò
+fatto
+favore
+fece
+fecero
+feci
+fin
+finalmente
+finche
+fine
+fino
+forse
+forza
+fosse
+fossero
+fossi
+fossimo
+foste
+fosti
+fra
+frattempo
+fu
+fui
+fummo
+fuori
+furono
+futuro
+generale
+gente
+gia
+giacche
+giorni
+giorno
+giu
+già
+giã
+gli
+gliela
+gliele
+glieli
+glielo
+gliene
+governo
+grande
+grazie
+gruppo
+ha
+haha
+hai
+hanno
+ho
+i
+ie
+ieri
+il
+improvviso
+in
+inc
+indietro
+infatti
+inoltre
+insieme
+intanto
+intorno
+invece
+io
+l
+la
+lasciato
+lato
+lavoro
+le
+lei
+li
+lo
+lontano
+loro
+lui
+lungo
+luogo
+là
+lã
+ma
+macche
+magari
+maggior
+mai
+male
+malgrado
+malissimo
+mancanza
+marche
+me
+medesimo
+mediante
+meglio
+meno
+mentre
+mesi
+mezzo
+mi
+mia
+mie
+miei
+mila
+miliardi
+milioni
+minimi
+ministro
+mio
+modo
+molta
+molti
+moltissimo
+molto
+momento
+mondo
+mosto
+nazionale
+ne
+negl
+negli
+nei
+nel
+nell
+nella
+nelle
+nello
+nemmeno
+neppure
+nessun
+nessuna
+nessuno
+niente
+no
+noi
+nome
+non
+nondimeno
+nonostante
+nonsia
+nostra
+nostre
+nostri
+nostro
+novanta
+nove
+nulla
+nuovi
+nuovo
+o
+od
+oggi
+ogni
+ognuna
+ognuno
+oltre
+oppure
+ora
+ore
+osi
+ossia
+ottanta
+otto
+paese
+parecchi
+parecchie
+parecchio
+parte
+partendo
+peccato
+peggio
+per
+perche
+perchã¨
+perchè
+perché
+percio
+perciã²
+perciò
+perfino
+pero
+persino
+persone
+perã²
+però
+piedi
+pieno
+piglia
+piu
+piuttosto
+piã¹
+più
+po
+pochissimo
+poco
+poi
+poiche
+possa
+possedere
+posteriore
+posto
+potrebbe
+preferibilmente
+presa
+press
+prima
+primo
+principalmente
+probabilmente
+promesso
+proprio
+puo
+pure
+purtroppo
+puã²
+può
+qua
+qualche
+qualcosa
+qualcuna
+qualcuno
+quale
+quali
+qualunque
+quando
+quanta
+quante
+quanti
+quanto
+quantunque
+quarto
+quasi
+quattro
+quel
+quella
+quelle
+quelli
+quello
+quest
+questa
+queste
+questi
+questo
+qui
+quindi
+quinto
+realmente
+recente
+recentemente
+registrazione
+relativo
+riecco
+rispetto
+salvo
+sara
+sarai
+saranno
+sarebbe
+sarebbero
+sarei
+saremmo
+saremo
+sareste
+saresti
+sarete
+sarà
+sarã
+sarò
+scola
+scopo
+scorso
+se
+secondo
+seguente
+seguito
+sei
+sembra
+sembrare
+sembrato
+sembrava
+sembri
+sempre
+senza
+sette
+si
+sia
+siamo
+siano
+siate
+siete
+sig
+solito
+solo
+soltanto
+sono
+sopra
+soprattutto
+sotto
+spesso
+srl
+sta
+stai
+stando
+stanno
+starai
+staranno
+starebbe
+starebbero
+starei
+staremmo
+staremo
+stareste
+staresti
+starete
+starà
+starò
+stata
+state
+stati
+stato
+stava
+stavamo
+stavano
+stavate
+stavi
+stavo
+stemmo
+stessa
+stesse
+stessero
+stessi
+stessimo
+stesso
+steste
+stesti
+stette
+stettero
+stetti
+stia
+stiamo
+stiano
+stiate
+sto
+su
+sua
+subito
+successivamente
+successivo
+sue
+sugl
+sugli
+sui
+sul
+sull
+sulla
+sulle
+sullo
+suo
+suoi
+tale
+tali
+talvolta
+tanto
+te
+tempo
+terzo
+th
+ti
+titolo
+torino
+tra
+tranne
+tre
+trenta
+triplo
+troppo
+trovato
+tu
+tua
+tue
+tuo
+tuoi
+tutta
+tuttavia
+tutte
+tutti
+tutto
+uguali
+ulteriore
+ultimo
+un
+una
+uno
+uomo
+va
+vai
+vale
+vari
+varia
+varie
+vario
+verso
+vi
+via
+vicino
+visto
+vita
+voi
+volta
+volte
+vostra
+vostre
+vostri
+vostro
+ã¨
+è
+\ No newline at end of file
diff --git a/static/stopwords/ja b/static/stopwords/ja

new file mode 100644 (file)

index 0000000..38735ee
--- /dev/null
+++ b/static/stopwords/ja
@@ -0,0 +1,134 @@
+あそこ
+あっ
+あの
+あのかた
+あの人
+あり
+あります
+ある
+あれ
+い
+いう
+います
+いる
+う
+うち
+え
+お
+および
+おり
+おります
+か
+かつて
+から
+が
+き
+ここ
+こちら
+こと
+この
+これ
+これら
+さ
+さらに
+し
+しかし
+する
+ず
+せ
+せる
+そこ
+そして
+その
+その他
+その後
+それ
+それぞれ
+それで
+た
+ただし
+たち
+ため
+たり
+だ
+だっ
+だれ
+つ
+て
+で
+でき
+できる
+です
+では
+でも
+と
+という
+といった
+とき
+ところ
+として
+とともに
+とも
+と共に
+どこ
+どの
+な
+ない
+なお
+なかっ
+ながら
+なく
+なっ
+など
+なに
+なら
+なり
+なる
+なん
+に
+において
+における
+について
+にて
+によって
+により
+による
+に対して
+に対する
+に関する
+の
+ので
+のみ
+は
+ば
+へ
+ほか
+ほとんど
+ほど
+ます
+また
+または
+まで
+も
+もの
+ものの
+や
+よう
+より
+ら
+られ
+られる
+れ
+れる
+を
+ん
+何
+及び
+彼
+彼女
+我々
+特に
+私
+私達
+貴方
+貴方方
+\ No newline at end of file
diff --git a/static/stopwords/ko b/static/stopwords/ko

new file mode 100644 (file)

index 0000000..4465f0f
--- /dev/null
+++ b/static/stopwords/ko
@@ -0,0 +1,679 @@
+!
+"
+$
+%
+&
+'
+(
+)
+*
++
+,
+-
+.
+...
+0
+1
+2
+3
+4
+5
+6
+7
+8
+9
+;
+<
+=
+>
+?
+@
+\
+^
+_
+`
+|
+~
+·
+—
+——
+‘
+’
+“
+”
+…
+、
+。
+〈
+〉
+《
+》
+가
+가까스로
+가령
+각
+각각
+각자
+각종
+갖고말하자면
+같다
+같이
+개의치않고
+거니와
+거바
+거의
+것
+것과 같이
+것들
+게다가
+게우다
+겨우
+견지에서
+결과에 이르다
+결국
+결론을 낼 수 있다
+겸사겸사
+고려하면
+고로
+곧
+공동으로
+과
+과연
+관계가 있다
+관계없이
+관련이 있다
+관하여
+관한
+관해서는
+구
+구체적으로
+구토하다
+그
+그들
+그때
+그래
+그래도
+그래서
+그러나
+그러니
+그러니까
+그러면
+그러므로
+그러한즉
+그런 까닭에
+그런데
+그런즉
+그럼
+그럼에도 불구하고
+그렇게 함으로써
+그렇지
+그렇지 않다면
+그렇지 않으면
+그렇지만
+그렇지않으면
+그리고
+그리하여
+그만이다
+그에 따르는
+그위에
+그저
+그중에서
+그치지 않다
+근거로
+근거하여
+기대여
+기점으로
+기준으로
+기타
+까닭으로
+까악
+까지
+까지 미치다
+까지도
+꽈당
+끙끙
+끼익
+나
+나머지는
+남들
+남짓
+너
+너희
+너희들
+네
+넷
+년
+논하지 않다
+놀라다
+누가 알겠는가
+누구
+다른
+다른 방면으로
+다만
+다섯
+다소
+다수
+다시 말하자면
+다시말하면
+다음
+다음에
+다음으로
+단지
+답다
+당신
+당장
+대로 하다
+대하면
+대하여
+대해 말하자면
+대해서
+댕그
+더구나
+더군다나
+더라도
+더불어
+더욱더
+더욱이는
+도달하다
+도착하다
+동시에
+동안
+된바에야
+된이상
+두번째로
+둘
+둥둥
+뒤따라
+뒤이어
+든간에
+들
+등
+등등
+딩동
+따라
+따라서
+따위
+따지지 않다
+딱
+때
+때가 되어
+때문에
+또
+또한
+뚝뚝
+라 해도
+령
+로
+로 인하여
+로부터
+로써
+륙
+를
+마음대로
+마저
+마저도
+마치
+막론하고
+만 못하다
+만약
+만약에
+만은 아니다
+만이 아니다
+만일
+만큼
+말하자면
+말할것도 없고
+매
+매번
+메쓰겁다
+몇
+모
+모두
+무렵
+무릎쓰고
+무슨
+무엇
+무엇때문에
+물론
+및
+바꾸어말하면
+바꾸어말하자면
+바꾸어서 말하면
+바꾸어서 한다면
+바꿔 말하면
+바로
+바와같이
+밖에 안된다
+반대로
+반대로 말하자면
+반드시
+버금
+보는데서
+보다더
+보드득
+본대로
+봐
+봐라
+부류의 사람들
+부터
+불구하고
+불문하고
+붕붕
+비걱거리다
+비교적
+비길수 없다
+비로소
+비록
+비슷하다
+비추어 보아
+비하면
+뿐만 아니라
+뿐만아니라
+뿐이다
+삐걱
+삐걱거리다
+사
+삼
+상대적으로 말하자면
+생각한대로
+설령
+설마
+설사
+셋
+소생
+소인
+솨
+쉿
+습니까
+습니다
+시각
+시간
+시작하여
+시초에
+시키다
+실로
+심지어
+아
+아니
+아니나다를가
+아니라면
+아니면
+아니었다면
+아래윗
+아무거나
+아무도
+아야
+아울러
+아이
+아이고
+아이구
+아이야
+아이쿠
+아하
+아홉
+안 그러면
+않기 위하여
+않기 위해서
+알 수 있다
+알았어
+앗
+앞에서
+앞의것
+야
+약간
+양자
+어
+어기여차
+어느
+어느 년도
+어느것
+어느곳
+어느때
+어느쪽
+어느해
+어디
+어때
+어떠한
+어떤
+어떤것
+어떤것들
+어떻게
+어떻해
+어이
+어째서
+어쨋든
+어쩔수 없다
+어찌
+어찌됏든
+어찌됏어
+어찌하든지
+어찌하여
+언제
+언젠가
+얼마
+얼마 안 되는 것
+얼마간
+얼마나
+얼마든지
+얼마만큼
+얼마큼
+엉엉
+에
+에 가서
+에 달려 있다
+에 대해
+에 있다
+에 한하다
+에게
+에서
+여
+여기
+여덟
+여러분
+여보시오
+여부
+여섯
+여전히
+여차
+연관되다
+연이서
+영
+영차
+옆사람
+예
+예를 들면
+예를 들자면
+예컨대
+예하면
+오
+오로지
+오르다
+오자마자
+오직
+오호
+오히려
+와
+와 같은 사람들
+와르르
+와아
+왜
+왜냐하면
+외에도
+요만큼
+요만한 것
+요만한걸
+요컨대
+우르르
+우리
+우리들
+우선
+우에 종합한것과같이
+운운
+월
+위에서 서술한바와같이
+위하여
+위해서
+윙윙
+육
+으로
+으로 인하여
+으로서
+으로써
+을
+응
+응당
+의
+의거하여
+의지하여
+의해
+의해되다
+의해서
+이
+이 되다
+이 때문에
+이 밖에
+이 외에
+이 정도의
+이것
+이곳
+이때
+이라면
+이래
+이러이러하다
+이러한
+이런
+이럴정도로
+이렇게 많은 것
+이렇게되면
+이렇게말하자면
+이렇구나
+이로 인하여
+이르기까지
+이리하여
+이만큼
+이번
+이봐
+이상
+이어서
+이었다
+이와 같다
+이와 같은
+이와 반대로
+이와같다면
+이외에도
+이용하여
+이유만으로
+이젠
+이지만
+이쪽
+이천구
+이천육
+이천칠
+이천팔
+인 듯하다
+인젠
+일
+일것이다
+일곱
+일단
+일때
+일반적으로
+일지라도
+임에 틀림없다
+입각하여
+입장에서
+잇따라
+있다
+자
+자기
+자기집
+자마자
+자신
+잠깐
+잠시
+저
+저것
+저것만큼
+저기
+저쪽
+저희
+전부
+전자
+전후
+점에서 보아
+정도에 이르다
+제
+제각기
+제외하고
+조금
+조차
+조차도
+졸졸
+좀
+좋아
+좍좍
+주룩주룩
+주저하지 않고
+줄은 몰랏다
+줄은모른다
+중에서
+중의하나
+즈음하여
+즉
+즉시
+지든지
+지만
+지말고
+진짜로
+쪽으로
+차라리
+참
+참나
+첫번째로
+쳇
+총적으로
+총적으로 말하면
+총적으로 보면
+칠
+콸콸
+쾅쾅
+쿵
+타다
+타인
+탕탕
+토하다
+통하여
+툭
+퉤
+틈타
+팍
+팔
+퍽
+펄렁
+하
+하게될것이다
+하게하다
+하겠는가
+하고 있다
+하고있었다
+하곤하였다
+하구나
+하기 때문에
+하기 위하여
+하기는한데
+하기만 하면
+하기보다는
+하기에
+하나
+하느니
+하는 김에
+하는 편이 낫다
+하는것도
+하는것만 못하다
+하는것이 낫다
+하는바
+하더라도
+하도다
+하도록시키다
+하도록하다
+하든지
+하려고하다
+하마터면
+하면 할수록
+하면된다
+하면서
+하물며
+하여금
+하여야
+하자마자
+하지 않는다면
+하지 않도록
+하지마
+하지마라
+하지만
+하하
+한 까닭에
+한 이유는
+한 후
+한다면
+한다면 몰라도
+한데
+한마디
+한적이있다
+한켠으로는
+한항목
+할 따름이다
+할 생각이다
+할 줄 안다
+할 지경이다
+할 힘이 있다
+할때
+할만하다
+할망정
+할뿐
+할수있다
+할수있어
+할줄알다
+할지라도
+할지언정
+함께
+해도된다
+해도좋다
+해봐요
+해서는 안된다
+해야한다
+해요
+했어요
+향하다
+향하여
+향해서
+허
+허걱
+허허
+헉
+헉헉
+헐떡헐떡
+형식으로 쓰여
+혹시
+혹은
+혼자
+훨씬
+휘익
+휴
+흐흐
+흥
+힘입어
+︿
+！
+＃
+＄
+％
+＆
+（
+）
+＊
+＋
+，
+０
+１
+２
+３
+４
+５
+６
+７
+８
+９
+：
+；
+＜
+＞
+？
+＠
+［
+］
+｛
+｜
+｝
+～
+￥
+\ No newline at end of file
diff --git a/static/stopwords/ku b/static/stopwords/ku

new file mode 100644 (file)

index 0000000..7b12cfe
--- /dev/null
+++ b/static/stopwords/ku
@@ -0,0 +1,62 @@
+ئێمە
+ئێوە
+ئەم
+ئەو
+ئەوان
+ئەوەی
+بۆ
+بێ
+بێجگە
+بە
+بەبێ
+بەدەم
+بەردەم
+بەرلە
+بەرەوی
+بەرەوە
+بەلای
+بەپێی
+تۆ
+تێ
+جگە
+دوای
+دوو
+دە
+دەکات
+دەگەڵ
+سەر
+لێ
+لە
+لەبابەت
+لەباتی
+لەبارەی
+لەبرێتی
+لەبن
+لەبەر
+لەبەینی
+لەدەم
+لەرێ
+لەرێگا
+لەرەوی
+لەسەر
+لەلایەن
+لەناو
+لەنێو
+لەو
+لەپێناوی
+لەژێر
+لەگەڵ
+من
+ناو
+نێوان
+هەر
+هەروەها
+و
+وەک
+پاش
+پێ
+پێش
+چەند
+کرد
+کە
+ی
+\ No newline at end of file
diff --git a/static/stopwords/la b/static/stopwords/la

new file mode 100644 (file)

index 0000000..00c5759
--- /dev/null
+++ b/static/stopwords/la
@@ -0,0 +1,49 @@
+a
+ab
+ac
+ad
+at
+atque
+aut
+autem
+cum
+de
+dum
+e
+erant
+erat
+est
+et
+etiam
+ex
+haec
+hic
+hoc
+in
+ita
+me
+nec
+neque
+non
+per
+qua
+quae
+quam
+qui
+quibus
+quidem
+quo
+quod
+re
+rebus
+rem
+res
+sed
+si
+sic
+sunt
+tamen
+tandem
+te
+ut
+vel
+\ No newline at end of file
diff --git a/static/stopwords/lt b/static/stopwords/lt

new file mode 100644 (file)

index 0000000..6b5242a
--- /dev/null
+++ b/static/stopwords/lt
@@ -0,0 +1,474 @@
+abi
+abidvi
+abiejose
+abiejuose
+abiejø
+abiem
+abigaliai
+abipus
+abu
+abudu
+ai
+ana
+anaiptol
+anaisiais
+anajai
+anajam
+anajame
+anapus
+anas
+anasai
+anasis
+anei
+aniedvi
+anieji
+aniesiems
+anoji
+anojo
+anojoje
+anokia
+anoks
+anosiomis
+anosioms
+anosios
+anosiose
+anot
+ant
+antai
+anuodu
+anuoju
+anuosiuose
+anuosius
+anàja
+anàjà
+anàjá
+anàsias
+anøjø
+apie
+aplink
+ar
+arba
+argi
+arti
+aukðèiau
+að
+be
+bei
+beje
+bemaþ
+bent
+bet
+betgi
+beveik
+dar
+dargi
+daugmaþ
+deja
+dëka
+dël
+dëlei
+dëlto
+ech
+et
+gal
+galbût
+galgi
+gan
+gana
+gi
+greta
+idant
+iki
+ir
+irgi
+it
+itin
+ið
+iðilgai
+iðvis
+jaisiais
+jajai
+jajam
+jajame
+jei
+jeigu
+ji
+jiedu
+jiedvi
+jieji
+jiesiems
+jinai
+jis
+jisai
+jog
+joji
+jojo
+jojoje
+jokia
+joks
+josiomis
+josioms
+josios
+josiose
+judu
+judvi
+juk
+jumis
+jums
+jumyse
+juodu
+juoju
+juosiuose
+juosius
+jus
+jàja
+jàjà
+jàsias
+jájá
+jøjø
+jûs
+jûsiðkis
+jûsiðkë
+jûsø
+kad
+kada
+kadangi
+kai
+kaip
+kaipgi
+kas
+katra
+katras
+katriedvi
+katruodu
+kaþin
+kaþkas
+kaþkatra
+kaþkatras
+kaþkokia
+kaþkoks
+kaþkuri
+kaþkuris
+kiaurai
+kiek
+kiekvienas
+kieno
+kita
+kitas
+kitokia
+kitoks
+kodël
+kokia
+koks
+kol
+kolei
+kone
+kuomet
+kur
+kurgi
+kuri
+kuriedvi
+kuris
+kuriuodu
+lai
+lig
+ligi
+link
+lyg
+man
+manaisiais
+manajai
+manajam
+manajame
+manas
+manasai
+manasis
+mane
+manieji
+maniesiems
+manim
+manimi
+maniðkis
+maniðkë
+mano
+manoji
+manojo
+manojoje
+manosiomis
+manosioms
+manosios
+manosiose
+manuoju
+manuosiuose
+manuosius
+manyje
+manàja
+manàjà
+manàjá
+manàsias
+manæs
+manøjø
+mat
+maþdaug
+maþne
+mes
+mudu
+mudvi
+mumis
+mums
+mumyse
+mus
+mûsiðkis
+mûsiðkë
+mûsø
+na
+nagi
+ne
+nebe
+nebent
+negi
+negu
+nei
+nejau
+nejaugi
+nekaip
+nelyginant
+nes
+net
+netgi
+netoli
+neva
+nors
+nuo
+në
+o
+ogi
+oi
+paeiliui
+pagal
+pakeliui
+palaipsniui
+palei
+pas
+pasak
+paskos
+paskui
+paskum
+pat
+pati
+patiems
+paties
+pats
+patys
+patá
+paèiais
+paèiam
+paèiame
+paèiu
+paèiuose
+paèius
+paèiø
+per
+pernelyg
+pirm
+pirma
+pirmiau
+po
+prie
+prieð
+prieðais
+pro
+pusiau
+rasi
+rodos
+sau
+savaisiais
+savajai
+savajam
+savajame
+savas
+savasai
+savasis
+save
+savieji
+saviesiems
+savimi
+saviðkis
+saviðkë
+savo
+savoji
+savojo
+savojoje
+savosiomis
+savosioms
+savosios
+savosiose
+savuoju
+savuosiuose
+savuosius
+savyje
+savàja
+savàjà
+savàjá
+savàsias
+savæs
+savøjø
+skersai
+skradþiai
+staèiai
+su
+sulig
+ta
+tad
+tai
+taigi
+taip
+taipogi
+taisiais
+tajai
+tajam
+tajame
+tamsta
+tarp
+tarsi
+tartum
+tarytum
+tas
+tasai
+tau
+tavaisiais
+tavajai
+tavajam
+tavajame
+tavas
+tavasai
+tavasis
+tave
+tavieji
+taviesiems
+tavimi
+taviðkis
+taviðkë
+tavo
+tavoji
+tavojo
+tavojoje
+tavosiomis
+tavosioms
+tavosios
+tavosiose
+tavuoju
+tavuosiuose
+tavuosius
+tavyje
+tavàja
+tavàjà
+tavàjá
+tavàsias
+tavæs
+tavøjø
+taèiau
+te
+tegu
+tegul
+tiedvi
+tieji
+ties
+tiesiems
+tiesiog
+tik
+tikriausiai
+tiktai
+toji
+tojo
+tojoje
+tokia
+toks
+tol
+tolei
+toliau
+tosiomis
+tosioms
+tosios
+tosiose
+tu
+tuodu
+tuoju
+tuosiuose
+tuosius
+turbût
+tàja
+tàjà
+tàjá
+tàsias
+tøjø
+tûlas
+uþ
+uþtat
+uþvis
+va
+vai
+viduj
+vidury
+vien
+vienas
+vienokia
+vienoks
+vietoj
+virð
+virðuj
+virðum
+vis
+vis dëlto
+visa
+visas
+visgi
+visokia
+visoks
+vos
+vël
+vëlgi
+ypaè
+á
+ákypai
+ástriþai
+ðalia
+ðe
+ði
+ðiaisiais
+ðiajai
+ðiajam
+ðiajame
+ðiapus
+ðiedvi
+ðieji
+ðiesiems
+ðioji
+ðiojo
+ðiojoje
+ðiokia
+ðioks
+ðiosiomis
+ðiosioms
+ðiosios
+ðiosiose
+ðis
+ðisai
+ðit
+ðita
+ðitas
+ðitiedvi
+ðitokia
+ðitoks
+ðituodu
+ðiuodu
+ðiuoju
+ðiuosiuose
+ðiuosius
+ðiàja
+ðiàjà
+ðiàsias
+ðiøjø
+ðtai
+ðájá
+þemiau
+\ No newline at end of file
diff --git a/static/stopwords/lv b/static/stopwords/lv

new file mode 100644 (file)

index 0000000..71fb149
--- /dev/null
+++ b/static/stopwords/lv
@@ -0,0 +1,161 @@
+aiz
+ap
+apakš
+apakšpus
+ar
+arī
+augšpus
+bet
+bez
+bija
+biji
+biju
+bijām
+bijāt
+būs
+būsi
+būsiet
+būsim
+būt
+būšu
+caur
+diemžēl
+diezin
+droši
+dēļ
+esam
+esat
+esi
+esmu
+gan
+gar
+iekam
+iekams
+iekām
+iekāms
+iekš
+iekšpus
+ik
+ir
+it
+itin
+iz
+ja
+jau
+jeb
+jebšu
+jel
+jo
+jā
+ka
+kamēr
+kaut
+kolīdz
+kopš
+kā
+kļuva
+kļuvi
+kļuvu
+kļuvām
+kļuvāt
+kļūs
+kļūsi
+kļūsiet
+kļūsim
+kļūst
+kļūstam
+kļūstat
+kļūsti
+kļūstu
+kļūt
+kļūšu
+labad
+lai
+lejpus
+līdz
+līdzko
+ne
+nebūt
+nedz
+nekā
+nevis
+nezin
+no
+nu
+nē
+otrpus
+pa
+par
+pat
+pie
+pirms
+pret
+priekš
+pār
+pēc
+starp
+tad
+tak
+tapi
+taps
+tapsi
+tapsiet
+tapsim
+tapt
+tapāt
+tapšu
+taču
+te
+tiec
+tiek
+tiekam
+tiekat
+tieku
+tik
+tika
+tikai
+tiki
+tikko
+tiklab
+tiklīdz
+tiks
+tiksiet
+tiksim
+tikt
+tiku
+tikvien
+tikām
+tikāt
+tikšu
+tomēr
+topat
+turpretim
+turpretī
+tā
+tādēļ
+tālab
+tāpēc
+un
+uz
+vai
+var
+varat
+varēja
+varēji
+varēju
+varējām
+varējāt
+varēs
+varēsi
+varēsiet
+varēsim
+varēt
+varēšu
+vien
+virs
+virspus
+vis
+viņpus
+zem
+ārpus
+šaipus
+\ No newline at end of file
diff --git a/static/stopwords/mr b/static/stopwords/mr

new file mode 100644 (file)

index 0000000..2034713
--- /dev/null
+++ b/static/stopwords/mr
@@ -0,0 +1,99 @@
+अधिक
+अनेक
+अशी
+असलयाचे
+असलेल्या
+असा
+असून
+असे
+आज
+आणि
+आता
+आपल्या
+आला
+आली
+आले
+आहे
+आहेत
+एक
+एका
+कमी
+करणयात
+करून
+का
+काम
+काय
+काही
+किवा
+की
+केला
+केली
+केले
+कोटी
+गेल्या
+घेऊन
+जात
+झाला
+झाली
+झाले
+झालेल्या
+टा
+डॉ
+तर
+तरी
+तसेच
+ता
+ती
+तीन
+ते
+तो
+त्या
+त्याचा
+त्याची
+त्याच्या
+त्याना
+त्यानी
+त्यामुळे
+त्री
+दिली
+दोन
+न
+नाही
+निर्ण्य
+पण
+पम
+परयतन
+पाटील
+म
+मात्र
+माहिती
+मी
+मुबी
+म्हणजे
+म्हणाले
+म्हणून
+या
+याचा
+याची
+याच्या
+याना
+यानी
+येणार
+येत
+येथील
+येथे
+लाख
+व
+व्यकत
+सर्व
+सागित्ले
+सुरू
+हजार
+हा
+ही
+हे
+होणार
+होत
+होता
+होती
+होते
+\ No newline at end of file
diff --git a/static/stopwords/ms b/static/stopwords/ms

new file mode 100644 (file)

index 0000000..268a0b7
--- /dev/null
+++ b/static/stopwords/ms
@@ -0,0 +1,475 @@
+abdul
+abdullah
+acara
+ada
+adalah
+ahmad
+air
+akan
+akhbar
+akhir
+aktiviti
+alam
+amat
+amerika
+anak
+anggota
+antara
+antarabangsa
+apa
+apabila
+april
+as
+asas
+asean
+asia
+asing
+atas
+atau
+australia
+awal
+awam
+bagaimanapun
+bagi
+bahagian
+bahan
+baharu
+bahawa
+baik
+bandar
+bank
+banyak
+barangan
+baru
+baru-baru
+bawah
+beberapa
+bekas
+beliau
+belum
+berada
+berakhir
+berbanding
+berdasarkan
+berharap
+berikutan
+berjaya
+berjumlah
+berkaitan
+berkata
+berkenaan
+berlaku
+bermula
+bernama
+bernilai
+bersama
+berubah
+besar
+bhd
+bidang
+bilion
+bn
+boleh
+bukan
+bulan
+bursa
+cadangan
+china
+dagangan
+dalam
+dan
+dana
+dapat
+dari
+daripada
+dasar
+datang
+datuk
+demikian
+dengan
+depan
+derivatives
+dewan
+di
+diadakan
+dibuka
+dicatatkan
+dijangka
+diniagakan
+dis
+disember
+ditutup
+dolar
+dr
+dua
+dunia
+ekonomi
+eksekutif
+eksport
+empat
+enam
+faedah
+feb
+global
+hadapan
+hanya
+harga
+hari
+hasil
+hingga
+hubungan
+ia
+iaitu
+ialah
+indeks
+india
+indonesia
+industri
+ini
+islam
+isnin
+isu
+itu
+jabatan
+jalan
+jan
+jawatan
+jawatankuasa
+jepun
+jika
+jualan
+juga
+julai
+jumaat
+jumlah
+jun
+juta
+kadar
+kalangan
+kali
+kami
+kata
+katanya
+kaunter
+kawasan
+ke
+keadaan
+kecil
+kedua
+kedua-dua
+kedudukan
+kekal
+kementerian
+kemudahan
+kenaikan
+kenyataan
+kepada
+kepentingan
+keputusan
+kerajaan
+kerana
+kereta
+kerja
+kerjasama
+kes
+keselamatan
+keseluruhan
+kesihatan
+ketika
+ketua
+keuntungan
+kewangan
+khamis
+kini
+kira-kira
+kita
+klci
+klibor
+komposit
+kontrak
+kos
+kuala
+kuasa
+kukuh
+kumpulan
+lagi
+lain
+langkah
+laporan
+lebih
+lepas
+lima
+lot
+luar
+lumpur
+mac
+mahkamah
+mahu
+majlis
+makanan
+maklumat
+malam
+malaysia
+mana
+manakala
+masa
+masalah
+masih
+masing-masing
+masyarakat
+mata
+media
+mei
+melalui
+melihat
+memandangkan
+memastikan
+membantu
+membawa
+memberi
+memberikan
+membolehkan
+membuat
+mempunyai
+menambah
+menarik
+menawarkan
+mencapai
+mencatatkan
+mendapat
+mendapatkan
+menerima
+menerusi
+mengadakan
+mengambil
+mengenai
+menggalakkan
+menggunakan
+mengikut
+mengumumkan
+mengurangkan
+meningkat
+meningkatkan
+menjadi
+menjelang
+menokok
+menteri
+menunjukkan
+menurut
+menyaksikan
+menyediakan
+mereka
+merosot
+merupakan
+mesyuarat
+minat
+minggu
+minyak
+modal
+mohd
+mudah
+mungkin
+naik
+najib
+nasional
+negara
+negara-negara
+negeri
+niaga
+nilai
+nov
+ogos
+okt
+oleh
+operasi
+orang
+pada
+pagi
+paling
+pameran
+papan
+para
+paras
+parlimen
+parti
+pasaran
+pasukan
+pegawai
+pejabat
+pekerja
+pelabur
+pelaburan
+pelancongan
+pelanggan
+pelbagai
+peluang
+pembangunan
+pemberita
+pembinaan
+pemimpin
+pendapatan
+pendidikan
+penduduk
+penerbangan
+pengarah
+pengeluaran
+pengerusi
+pengguna
+pengurusan
+peniaga
+peningkatan
+penting
+peratus
+perdagangan
+perdana
+peringkat
+perjanjian
+perkara
+perkhidmatan
+perladangan
+perlu
+permintaan
+perniagaan
+persekutuan
+persidangan
+pertama
+pertubuhan
+pertumbuhan
+perusahaan
+peserta
+petang
+pihak
+pilihan
+pinjaman
+polis
+politik
+presiden
+prestasi
+produk
+program
+projek
+proses
+proton
+pukul
+pula
+pusat
+rabu
+rakan
+rakyat
+ramai
+rantau
+raya
+rendah
+ringgit
+rumah
+sabah
+sahaja
+saham
+sama
+sarawak
+satu
+sawit
+saya
+sdn
+sebagai
+sebahagian
+sebanyak
+sebarang
+sebelum
+sebelumnya
+sebuah
+secara
+sedang
+segi
+sehingga
+sejak
+sekarang
+sektor
+sekuriti
+selain
+selama
+selasa
+selatan
+selepas
+seluruh
+semakin
+semalam
+semasa
+sementara
+semua
+semula
+sen
+sendiri
+seorang
+sepanjang
+seperti
+sept
+september
+serantau
+seri
+serta
+sesi
+setiap
+setiausaha
+sidang
+singapura
+sini
+sistem
+sokongan
+sri
+sudah
+sukan
+suku
+sumber
+supaya
+susut
+syarikat
+syed
+tahap
+tahun
+tan
+tanah
+tanpa
+tawaran
+teknologi
+telah
+tempat
+tempatan
+tempoh
+tenaga
+tengah
+tentang
+terbaik
+terbang
+terbesar
+terbuka
+terdapat
+terhadap
+termasuk
+tersebut
+terus
+tetapi
+thailand
+tiada
+tidak
+tiga
+timbalan
+timur
+tindakan
+tinggi
+tun
+tunai
+turun
+turut
+umno
+unit
+untuk
+untung
+urus
+usaha
+utama
+walaupun
+wang
+wanita
+wilayah
+yang
+\ No newline at end of file
diff --git a/static/stopwords/nl b/static/stopwords/nl

new file mode 100644 (file)

index 0000000..9c46fa3
--- /dev/null
+++ b/static/stopwords/nl
@@ -0,0 +1,413 @@
+aan
+aangaande
+aangezien
+achte
+achter
+achterna
+af
+afgelopen
+al
+aldaar
+aldus
+alhoewel
+alias
+alle
+allebei
+alleen
+alles
+als
+alsnog
+altijd
+altoos
+ander
+andere
+anders
+anderszins
+beetje
+behalve
+behoudens
+beide
+beiden
+ben
+beneden
+bent
+bepaald
+betreffende
+bij
+bijna
+bijv
+binnen
+binnenin
+blijkbaar
+blijken
+boven
+bovenal
+bovendien
+bovengenoemd
+bovenstaand
+bovenvermeld
+buiten
+bv
+daar
+daardoor
+daarheen
+daarin
+daarna
+daarnet
+daarom
+daarop
+daaruit
+daarvanlangs
+dan
+dat
+de
+deden
+deed
+der
+derde
+derhalve
+dertig
+deze
+dhr
+die
+dikwijls
+dit
+doch
+doe
+doen
+doet
+door
+doorgaand
+drie
+duizend
+dus
+echter
+een
+eens
+eer
+eerdat
+eerder
+eerlang
+eerst
+eerste
+eigen
+eigenlijk
+elk
+elke
+en
+enig
+enige
+enigszins
+enkel
+er
+erdoor
+erg
+ergens
+etc
+etcetera
+even
+eveneens
+evenwel
+gauw
+ge
+gedurende
+geen
+gehad
+gekund
+geleden
+gelijk
+gemoeten
+gemogen
+genoeg
+geweest
+gewoon
+gewoonweg
+haar
+haarzelf
+had
+hadden
+hare
+heb
+hebben
+hebt
+hedden
+heeft
+heel
+hem
+hemzelf
+hen
+het
+hetzelfde
+hier
+hierbeneden
+hierboven
+hierin
+hierna
+hierom
+hij
+hijzelf
+hoe
+hoewel
+honderd
+hun
+hunne
+ieder
+iedere
+iedereen
+iemand
+iets
+ik
+ikzelf
+in
+inderdaad
+inmiddels
+intussen
+inzake
+is
+ja
+je
+jezelf
+jij
+jijzelf
+jou
+jouw
+jouwe
+juist
+jullie
+kan
+klaar
+kon
+konden
+krachtens
+kun
+kunnen
+kunt
+laatst
+later
+liever
+lijken
+lijkt
+maak
+maakt
+maakte
+maakten
+maar
+mag
+maken
+me
+meer
+meest
+meestal
+men
+met
+mevr
+mezelf
+mij
+mijn
+mijnent
+mijner
+mijzelf
+minder
+miss
+misschien
+missen
+mits
+mocht
+mochten
+moest
+moesten
+moet
+moeten
+mogen
+mr
+mrs
+mw
+na
+naar
+nadat
+nam
+namelijk
+nee
+neem
+negen
+nemen
+nergens
+net
+niemand
+niet
+niets
+niks
+noch
+nochtans
+nog
+nogal
+nooit
+nu
+nv
+of
+ofschoon
+om
+omdat
+omhoog
+omlaag
+omstreeks
+omtrent
+omver
+ondanks
+onder
+ondertussen
+ongeveer
+ons
+onszelf
+onze
+onzeker
+ooit
+ook
+op
+opnieuw
+opzij
+over
+overal
+overeind
+overige
+overigens
+paar
+pas
+per
+precies
+recent
+redelijk
+reeds
+rond
+rondom
+samen
+sedert
+sinds
+sindsdien
+slechts
+sommige
+spoedig
+steeds
+tamelijk
+te
+tegen
+tegenover
+tenzij
+terwijl
+thans
+tien
+tiende
+tijdens
+tja
+toch
+toe
+toen
+toenmaals
+toenmalig
+tot
+totdat
+tussen
+twee
+tweede
+u
+uit
+uitgezonderd
+uw
+vaak
+vaakwat
+van
+vanaf
+vandaan
+vanuit
+vanwege
+veel
+veeleer
+veertig
+verder
+verscheidene
+verschillende
+vervolgens
+via
+vier
+vierde
+vijf
+vijfde
+vijftig
+vol
+volgend
+volgens
+voor
+vooraf
+vooral
+vooralsnog
+voorbij
+voordat
+voordezen
+voordien
+voorheen
+voorop
+voorts
+vooruit
+vrij
+vroeg
+waar
+waarom
+waarschijnlijk
+wanneer
+want
+waren
+was
+wat
+we
+wederom
+weer
+weg
+wegens
+weinig
+wel
+weldra
+welk
+welke
+werd
+werden
+werder
+wezen
+whatever
+wie
+wiens
+wier
+wij
+wijzelf
+wil
+wilden
+willen
+word
+worden
+wordt
+zal
+ze
+zei
+zeker
+zelf
+zelfde
+zelfs
+zes
+zeven
+zich
+zichzelf
+zij
+zijn
+zijne
+zijzelf
+zo
+zoals
+zodat
+zodra
+zonder
+zou
+zouden
+zowat
+zulk
+zulke
+zullen
+zult
+\ No newline at end of file
diff --git a/static/stopwords/no b/static/stopwords/no

new file mode 100644 (file)

index 0000000..d36c367
--- /dev/null
+++ b/static/stopwords/no
@@ -0,0 +1,221 @@
+alle
+andre
+arbeid
+at
+av
+bare
+begge
+ble
+blei
+bli
+blir
+blitt
+bort
+bra
+bruke
+både
+båe
+da
+de
+deg
+dei
+deim
+deira
+deires
+dem
+den
+denne
+der
+dere
+deres
+det
+dette
+di
+din
+disse
+ditt
+du
+dykk
+dykkar
+då
+eg
+ein
+eit
+eitt
+eller
+elles
+en
+ene
+eneste
+enhver
+enn
+er
+et
+ett
+etter
+folk
+for
+fordi
+forsûke
+fra
+få
+før
+fûr
+fûrst
+gjorde
+gjûre
+god
+gå
+ha
+hadde
+han
+hans
+har
+hennar
+henne
+hennes
+her
+hjå
+ho
+hoe
+honom
+hoss
+hossen
+hun
+hva
+hvem
+hver
+hvilke
+hvilken
+hvis
+hvor
+hvordan
+hvorfor
+i
+ikke
+ikkje
+ingen
+ingi
+inkje
+inn
+innen
+inni
+ja
+jeg
+kan
+kom
+korleis
+korso
+kun
+kunne
+kva
+kvar
+kvarhelst
+kven
+kvi
+kvifor
+lage
+lang
+lik
+like
+makt
+man
+mange
+me
+med
+medan
+meg
+meget
+mellom
+men
+mens
+mer
+mest
+mi
+min
+mine
+mitt
+mot
+mye
+mykje
+må
+måte
+navn
+ned
+nei
+no
+noe
+noen
+noka
+noko
+nokon
+nokor
+nokre
+ny
+nå
+når
+og
+også
+om
+opp
+oss
+over
+part
+punkt
+på
+rett
+riktig
+samme
+sant
+seg
+selv
+si
+sia
+sidan
+siden
+sin
+sine
+sist
+sitt
+sjøl
+skal
+skulle
+slik
+slutt
+so
+som
+somme
+somt
+start
+stille
+så
+sånn
+tid
+til
+tilbake
+tilstand
+um
+under
+upp
+ut
+uten
+var
+vart
+varte
+ved
+verdi
+vere
+verte
+vi
+vil
+ville
+vite
+vore
+vors
+vort
+vår
+være
+vært
+vöre
+vört
+å
+\ No newline at end of file
diff --git a/static/stopwords/pl b/static/stopwords/pl

new file mode 100644 (file)

index 0000000..ba4bb0b
--- /dev/null
+++ b/static/stopwords/pl
@@ -0,0 +1,328 @@
+a
+aby
+ach
+acz
+aczkolwiek
+aj
+albo
+ale
+ależ
+ani
+aż
+bardziej
+bardzo
+bez
+bo
+bowiem
+by
+byli
+bym
+bynajmniej
+być
+był
+była
+było
+były
+będzie
+będą
+cali
+cała
+cały
+chce
+choć
+ci
+ciebie
+cię
+co
+cokolwiek
+coraz
+coś
+czasami
+czasem
+czemu
+czy
+czyli
+często
+daleko
+dla
+dlaczego
+dlatego
+do
+dobrze
+dokąd
+dość
+dr
+dużo
+dwa
+dwaj
+dwie
+dwoje
+dzisiaj
+dziś
+gdy
+gdyby
+gdyż
+gdzie
+gdziekolwiek
+gdzieś
+go
+godz
+hab
+i
+ich
+ii
+iii
+ile
+im
+inna
+inne
+inny
+innych
+inż
+iv
+ix
+iż
+ja
+jak
+jakaś
+jakby
+jaki
+jakichś
+jakie
+jakiś
+jakiż
+jakkolwiek
+jako
+jakoś
+je
+jeden
+jedna
+jednak
+jednakże
+jedno
+jednym
+jedynie
+jego
+jej
+jemu
+jest
+jestem
+jeszcze
+jeśli
+jeżeli
+już
+ją
+każdy
+kiedy
+kierunku
+kilka
+kilku
+kimś
+kto
+ktokolwiek
+ktoś
+która
+które
+którego
+której
+który
+których
+którym
+którzy
+ku
+lat
+lecz
+lub
+ma
+mają
+mam
+mamy
+mało
+mgr
+mi
+miał
+mimo
+między
+mnie
+mną
+mogą
+moi
+moim
+moja
+moje
+może
+możliwe
+można
+mu
+musi
+my
+mój
+na
+nad
+nam
+nami
+nas
+nasi
+nasz
+nasza
+nasze
+naszego
+naszych
+natomiast
+natychmiast
+nawet
+nic
+nich
+nie
+niech
+niego
+niej
+niemu
+nigdy
+nim
+nimi
+nią
+niż
+no
+nowe
+np
+nr
+o
+o.o.
+obok
+od
+ok
+około
+on
+ona
+one
+oni
+ono
+oraz
+oto
+owszem
+pan
+pana
+pani
+pl
+po
+pod
+podczas
+pomimo
+ponad
+ponieważ
+powinien
+powinna
+powinni
+powinno
+poza
+prawie
+prof
+przecież
+przed
+przede
+przedtem
+przez
+przy
+raz
+razie
+roku
+również
+sam
+sama
+się
+skąd
+sobie
+sobą
+sposób
+swoje
+są
+ta
+tak
+taka
+taki
+takich
+takie
+także
+tam
+te
+tego
+tej
+tel
+temu
+ten
+teraz
+też
+to
+tobie
+tobą
+toteż
+trzeba
+tu
+tutaj
+twoi
+twoim
+twoja
+twoje
+twym
+twój
+ty
+tych
+tylko
+tym
+tys
+tzw
+tę
+u
+ul
+vi
+vii
+viii
+vol
+w
+wam
+wami
+was
+wasi
+wasz
+wasza
+wasze
+we
+według
+wie
+wiele
+wielu
+więc
+więcej
+wszyscy
+wszystkich
+wszystkie
+wszystkim
+wszystko
+wtedy
+www
+wy
+właśnie
+wśród
+xi
+xii
+xiii
+xiv
+xv
+z
+za
+zapewne
+zawsze
+zaś
+ze
+zeznowu
+znowu
+znów
+został
+zł
+żaden
+żadna
+żadne
+żadnych
+że
+żeby
diff --git a/static/stopwords/pt b/static/stopwords/pt

new file mode 100644 (file)

index 0000000..53e4298
--- /dev/null
+++ b/static/stopwords/pt
@@ -0,0 +1,560 @@
+a
+acerca
+adeus
+agora
+ainda
+alem
+algmas
+algo
+algumas
+alguns
+ali
+além
+ambas
+ambos
+ano
+anos
+antes
+ao
+aonde
+aos
+apenas
+apoio
+apontar
+apos
+após
+aquela
+aquelas
+aquele
+aqueles
+aqui
+aquilo
+as
+assim
+através
+atrás
+até
+aí
+baixo
+bastante
+bem
+boa
+boas
+bom
+bons
+breve
+cada
+caminho
+catorze
+cedo
+cento
+certamente
+certeza
+cima
+cinco
+coisa
+com
+como
+comprido
+conhecido
+conselho
+contra
+contudo
+corrente
+cuja
+cujas
+cujo
+cujos
+custa
+cá
+da
+daquela
+daquelas
+daquele
+daqueles
+dar
+das
+de
+debaixo
+dela
+delas
+dele
+deles
+demais
+dentro
+depois
+desde
+desligado
+dessa
+dessas
+desse
+desses
+desta
+destas
+deste
+destes
+deve
+devem
+deverá
+dez
+dezanove
+dezasseis
+dezassete
+dezoito
+dia
+diante
+direita
+dispoe
+dispoem
+diversa
+diversas
+diversos
+diz
+dizem
+dizer
+do
+dois
+dos
+doze
+duas
+durante
+dá
+dão
+dúvida
+e
+ela
+elas
+ele
+eles
+em
+embora
+enquanto
+entao
+entre
+então
+era
+eram
+essa
+essas
+esse
+esses
+esta
+estado
+estamos
+estar
+estará
+estas
+estava
+estavam
+este
+esteja
+estejam
+estejamos
+estes
+esteve
+estive
+estivemos
+estiver
+estivera
+estiveram
+estiverem
+estivermos
+estivesse
+estivessem
+estiveste
+estivestes
+estivéramos
+estivéssemos
+estou
+está
+estás
+estávamos
+estão
+eu
+exemplo
+falta
+fará
+favor
+faz
+fazeis
+fazem
+fazemos
+fazer
+fazes
+fazia
+faço
+fez
+fim
+final
+foi
+fomos
+for
+fora
+foram
+forem
+forma
+formos
+fosse
+fossem
+foste
+fostes
+fui
+fôramos
+fôssemos
+geral
+grande
+grandes
+grupo
+ha
+haja
+hajam
+hajamos
+havemos
+havia
+hei
+hoje
+hora
+horas
+houve
+houvemos
+houver
+houvera
+houveram
+houverei
+houverem
+houveremos
+houveria
+houveriam
+houvermos
+houverá
+houverão
+houveríamos
+houvesse
+houvessem
+houvéramos
+houvéssemos
+há
+hão
+iniciar
+inicio
+ir
+irá
+isso
+ista
+iste
+isto
+já
+lado
+lhe
+lhes
+ligado
+local
+logo
+longe
+lugar
+lá
+maior
+maioria
+maiorias
+mais
+mal
+mas
+me
+mediante
+meio
+menor
+menos
+meses
+mesma
+mesmas
+mesmo
+mesmos
+meu
+meus
+mil
+minha
+minhas
+momento
+muito
+muitos
+máximo
+mês
+na
+nada
+nao
+naquela
+naquelas
+naquele
+naqueles
+nas
+nem
+nenhuma
+nessa
+nessas
+nesse
+nesses
+nesta
+nestas
+neste
+nestes
+no
+noite
+nome
+nos
+nossa
+nossas
+nosso
+nossos
+nova
+novas
+nove
+novo
+novos
+num
+numa
+numas
+nunca
+nuns
+não
+nível
+nós
+número
+o
+obra
+obrigada
+obrigado
+oitava
+oitavo
+oito
+onde
+ontem
+onze
+os
+ou
+outra
+outras
+outro
+outros
+para
+parece
+parte
+partir
+paucas
+pegar
+pela
+pelas
+pelo
+pelos
+perante
+perto
+pessoas
+pode
+podem
+poder
+poderá
+podia
+pois
+ponto
+pontos
+por
+porque
+porquê
+portanto
+posição
+possivelmente
+posso
+possível
+pouca
+pouco
+poucos
+povo
+primeira
+primeiras
+primeiro
+primeiros
+promeiro
+propios
+proprio
+própria
+próprias
+próprio
+próprios
+próxima
+próximas
+próximo
+próximos
+puderam
+pôde
+põe
+põem
+quais
+qual
+qualquer
+quando
+quanto
+quarta
+quarto
+quatro
+que
+quem
+quer
+quereis
+querem
+queremas
+queres
+quero
+questão
+quieto
+quinta
+quinto
+quinze
+quáis
+quê
+relação
+sabe
+sabem
+saber
+se
+segunda
+segundo
+sei
+seis
+seja
+sejam
+sejamos
+sem
+sempre
+sendo
+ser
+serei
+seremos
+seria
+seriam
+será
+serão
+seríamos
+sete
+seu
+seus
+sexta
+sexto
+sim
+sistema
+sob
+sobre
+sois
+somente
+somos
+sou
+sua
+suas
+são
+sétima
+sétimo
+só
+tal
+talvez
+tambem
+também
+tanta
+tantas
+tanto
+tarde
+te
+tem
+temos
+tempo
+tendes
+tenha
+tenham
+tenhamos
+tenho
+tens
+tentar
+tentaram
+tente
+tentei
+ter
+terceira
+terceiro
+terei
+teremos
+teria
+teriam
+terá
+terão
+teríamos
+teu
+teus
+teve
+tinha
+tinham
+tipo
+tive
+tivemos
+tiver
+tivera
+tiveram
+tiverem
+tivermos
+tivesse
+tivessem
+tiveste
+tivestes
+tivéramos
+tivéssemos
+toda
+todas
+todo
+todos
+trabalhar
+trabalho
+treze
+três
+tu
+tua
+tuas
+tudo
+tão
+tém
+têm
+tínhamos
+um
+uma
+umas
+uns
+usa
+usar
+vai
+vais
+valor
+veja
+vem
+vens
+ver
+verdade
+verdadeiro
+vez
+vezes
+viagem
+vindo
+vinte
+você
+vocês
+vos
+vossa
+vossas
+vosso
+vossos
+vários
+vão
+vêm
+vós
+zero
+à
+às
+área
+é
+éramos
+és
+último
+\ No newline at end of file
diff --git a/static/stopwords/ro b/static/stopwords/ro

new file mode 100644 (file)

index 0000000..6b7dbfd
--- /dev/null
+++ b/static/stopwords/ro
@@ -0,0 +1,434 @@
+a
+abia
+acea
+aceasta
+această
+aceea
+aceeasi
+acei
+aceia
+acel
+acela
+acelasi
+acele
+acelea
+acest
+acesta
+aceste
+acestea
+acestei
+acestia
+acestui
+aceşti
+aceştia
+acolo
+acord
+acum
+adica
+ai
+aia
+aibă
+aici
+aiurea
+al
+ala
+alaturi
+ale
+alea
+alt
+alta
+altceva
+altcineva
+alte
+altfel
+alti
+altii
+altul
+am
+anume
+apoi
+ar
+are
+as
+asa
+asemenea
+asta
+astazi
+astea
+astfel
+astăzi
+asupra
+atare
+atat
+atata
+atatea
+atatia
+ati
+atit
+atita
+atitea
+atitia
+atunci
+au
+avea
+avem
+aveţi
+avut
+azi
+aş
+aşadar
+aţi
+b
+ba
+bine
+bucur
+bună
+c
+ca
+cam
+cand
+capat
+care
+careia
+carora
+caruia
+cat
+catre
+caut
+ce
+cea
+ceea
+cei
+ceilalti
+cel
+cele
+celor
+ceva
+chiar
+ci
+cinci
+cind
+cine
+cineva
+cit
+cita
+cite
+citeva
+citi
+citiva
+conform
+contra
+cu
+cui
+cum
+cumva
+curând
+curînd
+când
+cât
+câte
+câtva
+câţi
+cînd
+cît
+cîte
+cîtva
+cîţi
+că
+căci
+cărei
+căror
+cărui
+către
+d
+da
+daca
+dacă
+dar
+dat
+datorită
+dată
+dau
+de
+deasupra
+deci
+decit
+degraba
+deja
+deoarece
+departe
+desi
+despre
+deşi
+din
+dinaintea
+dintr
+dintr-
+dintre
+doar
+doi
+doilea
+două
+drept
+dupa
+după
+dă
+e
+ea
+ei
+el
+ele
+era
+eram
+este
+eu
+exact
+eşti
+f
+face
+fara
+fata
+fel
+fi
+fie
+fiecare
+fii
+fim
+fiu
+fiţi
+foarte
+fost
+frumos
+fără
+g
+geaba
+graţie
+h
+halbă
+i
+ia
+iar
+ieri
+ii
+il
+imi
+in
+inainte
+inapoi
+inca
+incit
+insa
+intr
+intre
+isi
+iti
+j
+k
+l
+la
+le
+li
+lor
+lui
+lângă
+lîngă
+m
+ma
+mai
+mare
+mea
+mei
+mele
+mereu
+meu
+mi
+mie
+mine
+mod
+mult
+multa
+multe
+multi
+multă
+mulţi
+mulţumesc
+mâine
+mîine
+mă
+n
+ne
+nevoie
+ni
+nici
+niciodata
+nicăieri
+nimeni
+nimeri
+nimic
+niste
+nişte
+noastre
+noastră
+noi
+noroc
+nostri
+nostru
+nou
+noua
+nouă
+noştri
+nu
+numai
+o
+opt
+or
+ori
+oricare
+orice
+oricine
+oricum
+oricând
+oricât
+oricînd
+oricît
+oriunde
+p
+pai
+parca
+patra
+patru
+patrulea
+pe
+pentru
+peste
+pic
+pina
+plus
+poate
+pot
+prea
+prima
+primul
+prin
+printr-
+putini
+puţin
+puţina
+puţină
+până
+pînă
+r
+rog
+s
+sa
+sa-mi
+sa-ti
+sai
+sale
+sau
+se
+si
+sint
+sintem
+spate
+spre
+sub
+sunt
+suntem
+sunteţi
+sus
+sută
+sînt
+sîntem
+sînteţi
+să
+săi
+său
+t
+ta
+tale
+te
+ti
+timp
+tine
+toata
+toate
+toată
+tocmai
+tot
+toti
+totul
+totusi
+totuşi
+toţi
+trei
+treia
+treilea
+tu
+tuturor
+tăi
+tău
+u
+ul
+ului
+un
+una
+unde
+undeva
+unei
+uneia
+unele
+uneori
+unii
+unor
+unora
+unu
+unui
+unuia
+unul
+v
+va
+vi
+voastre
+voastră
+voi
+vom
+vor
+vostru
+vouă
+voştri
+vreme
+vreo
+vreun
+vă
+x
+z
+zece
+zero
+zi
+zice
+îi
+îl
+îmi
+împotriva
+în
+înainte
+înaintea
+încotro
+încât
+încît
+între
+întrucât
+întrucît
+îţi
+ăla
+ălea
+ăsta
+ăstea
+ăştia
+şapte
+şase
+şi
+ştiu
+ţi
+ţie
+\ No newline at end of file
diff --git a/static/stopwords/ru b/static/stopwords/ru

new file mode 100644 (file)

index 0000000..5db5ef1
--- /dev/null
+++ b/static/stopwords/ru
@@ -0,0 +1,559 @@
+c
+а
+алло
+без
+белый
+близко
+более
+больше
+большой
+будем
+будет
+будете
+будешь
+будто
+буду
+будут
+будь
+бы
+бывает
+бывь
+был
+была
+были
+было
+быть
+в
+важная
+важное
+важные
+важный
+вам
+вами
+вас
+ваш
+ваша
+ваше
+ваши
+вверх
+вдали
+вдруг
+ведь
+везде
+вернуться
+весь
+вечер
+взгляд
+взять
+вид
+видел
+видеть
+вместе
+вне
+вниз
+внизу
+во
+вода
+война
+вокруг
+вон
+вообще
+вопрос
+восемнадцатый
+восемнадцать
+восемь
+восьмой
+вот
+впрочем
+времени
+время
+все
+все еще
+всегда
+всего
+всем
+всеми
+всему
+всех
+всею
+всю
+всюду
+вся
+всё
+второй
+вы
+выйти
+г
+где
+главный
+глаз
+говорил
+говорит
+говорить
+год
+года
+году
+голова
+голос
+город
+да
+давать
+давно
+даже
+далекий
+далеко
+дальше
+даром
+дать
+два
+двадцатый
+двадцать
+две
+двенадцатый
+двенадцать
+дверь
+двух
+девятнадцатый
+девятнадцать
+девятый
+девять
+действительно
+дел
+делал
+делать
+делаю
+дело
+день
+деньги
+десятый
+десять
+для
+до
+довольно
+долго
+должен
+должно
+должный
+дом
+дорога
+друг
+другая
+другие
+других
+друго
+другое
+другой
+думать
+душа
+е
+его
+ее
+ей
+ему
+если
+есть
+еще
+ещё
+ею
+её
+ж
+ждать
+же
+жена
+женщина
+жизнь
+жить
+за
+занят
+занята
+занято
+заняты
+затем
+зато
+зачем
+здесь
+земля
+знать
+значит
+значить
+и
+иди
+идти
+из
+или
+им
+имеет
+имел
+именно
+иметь
+ими
+имя
+иногда
+их
+к
+каждая
+каждое
+каждые
+каждый
+кажется
+казаться
+как
+какая
+какой
+кем
+книга
+когда
+кого
+ком
+комната
+кому
+конец
+конечно
+которая
+которого
+которой
+которые
+который
+которых
+кроме
+кругом
+кто
+куда
+лежать
+лет
+ли
+лицо
+лишь
+лучше
+любить
+люди
+м
+маленький
+мало
+мать
+машина
+между
+меля
+менее
+меньше
+меня
+место
+миллионов
+мимо
+минута
+мир
+мира
+мне
+много
+многочисленная
+многочисленное
+многочисленные
+многочисленный
+мной
+мною
+мог
+могу
+могут
+мож
+может
+может быть
+можно
+можхо
+мои
+мой
+мор
+москва
+мочь
+моя
+моё
+мы
+на
+наверху
+над
+надо
+назад
+наиболее
+найти
+наконец
+нам
+нами
+народ
+нас
+начала
+начать
+наш
+наша
+наше
+наши
+не
+него
+недавно
+недалеко
+нее
+ней
+некоторый
+нельзя
+нем
+немного
+нему
+непрерывно
+нередко
+несколько
+нет
+нею
+неё
+ни
+нибудь
+ниже
+низко
+никакой
+никогда
+никто
+никуда
+ним
+ними
+них
+ничего
+ничто
+но
+новый
+нога
+ночь
+ну
+нужно
+нужный
+нх
+о
+об
+оба
+обычно
+один
+одиннадцатый
+одиннадцать
+однажды
+однако
+одного
+одной
+оказаться
+окно
+около
+он
+она
+они
+оно
+опять
+особенно
+остаться
+от
+ответить
+отец
+откуда
+отовсюду
+отсюда
+очень
+первый
+перед
+писать
+плечо
+по
+под
+подойди
+подумать
+пожалуйста
+позже
+пойти
+пока
+пол
+получить
+помнить
+понимать
+понять
+пор
+пора
+после
+последний
+посмотреть
+посреди
+потом
+потому
+почему
+почти
+правда
+прекрасно
+при
+про
+просто
+против
+процентов
+путь
+пятнадцатый
+пятнадцать
+пятый
+пять
+работа
+работать
+раз
+разве
+рано
+раньше
+ребенок
+решить
+россия
+рука
+русский
+ряд
+рядом
+с
+с кем
+сам
+сама
+сами
+самим
+самими
+самих
+само
+самого
+самой
+самом
+самому
+саму
+самый
+свет
+свое
+своего
+своей
+свои
+своих
+свой
+свою
+сделать
+сеаой
+себе
+себя
+сегодня
+седьмой
+сейчас
+семнадцатый
+семнадцать
+семь
+сидеть
+сила
+сих
+сказал
+сказала
+сказать
+сколько
+слишком
+слово
+случай
+смотреть
+сначала
+снова
+со
+собой
+собою
+советский
+совсем
+спасибо
+спросить
+сразу
+стал
+старый
+стать
+стол
+сторона
+стоять
+страна
+суть
+считать
+т
+та
+так
+такая
+также
+таки
+такие
+такое
+такой
+там
+твои
+твой
+твоя
+твоё
+те
+тебе
+тебя
+тем
+теми
+теперь
+тех
+то
+тобой
+тобою
+товарищ
+тогда
+того
+тоже
+только
+том
+тому
+тот
+тою
+третий
+три
+тринадцатый
+тринадцать
+ту
+туда
+тут
+ты
+тысяч
+у
+увидеть
+уж
+уже
+улица
+уметь
+утро
+хороший
+хорошо
+хотел бы
+хотеть
+хоть
+хотя
+хочешь
+час
+часто
+часть
+чаще
+чего
+человек
+чем
+чему
+через
+четвертый
+четыре
+четырнадцатый
+четырнадцать
+что
+чтоб
+чтобы
+чуть
+шестнадцатый
+шестнадцать
+шестой
+шесть
+эта
+эти
+этим
+этими
+этих
+это
+этого
+этой
+этом
+этому
+этот
+эту
+я
+являюсь
+\ No newline at end of file
diff --git a/static/stopwords/sk b/static/stopwords/sk

new file mode 100644 (file)

index 0000000..4e71786
--- /dev/null
+++ b/static/stopwords/sk
@@ -0,0 +1,221 @@
+a
+aby
+aj
+ak
+ako
+aký
+ale
+alebo
+and
+ani
+asi
+avšak
+až
+ba
+bez
+bol
+bola
+boli
+bolo
+bude
+budem
+budeme
+budete
+budeš
+budú
+buï
+buď
+by
+byť
+cez
+dnes
+do
+ešte
+for
+ho
+hoci
+i
+iba
+ich
+im
+iné
+iný
+ja
+je
+jeho
+jej
+jemu
+ju
+k
+kam
+každá
+každé
+každí
+každý
+kde
+kedže
+keï
+keď
+kto
+ktorou
+ktorá
+ktoré
+ktorí
+ktorý
+ku
+lebo
+len
+ma
+mať
+medzi
+menej
+mi
+mna
+mne
+mnou
+moja
+moje
+mu
+musieť
+my
+má
+máte
+mòa
+môcť
+môj
+môže
+na
+nad
+nami
+naši
+nech
+neho
+nej
+nemu
+než
+nich
+nie
+niektorý
+nielen
+nim
+nič
+no
+nová
+nové
+noví
+nový
+nám
+nás
+náš
+ním
+o
+od
+odo
+of
+on
+ona
+oni
+ono
+ony
+po
+pod
+podľa
+pokiaľ
+potom
+pre
+pred
+predo
+preto
+pretože
+prečo
+pri
+prvá
+prvé
+prví
+prvý
+práve
+pýta
+s
+sa
+seba
+sem
+si
+sme
+so
+som
+späť
+ste
+svoj
+svoje
+svojich
+svojím
+svojími
+sú
+ta
+tak
+taký
+takže
+tam
+te
+teba
+tebe
+tebou
+teda
+tej
+ten
+tento
+the
+ti
+tie
+tieto
+tiež
+to
+toho
+tohoto
+tom
+tomto
+tomu
+tomuto
+toto
+tou
+tu
+tvoj
+tvojími
+ty
+tá
+táto
+tú
+túto
+tým
+týmto
+tě
+už
+v
+vami
+vaše
+veï
+viac
+vo
+vy
+vám
+vás
+váš
+však
+všetok
+z
+za
+zo
+\9da
+áno
+èi
+èo
+èí
+òom
+òou
+òu
+či
+čo
+ďalšia
+ďalšie
+ďalší
+že
+\ No newline at end of file
diff --git a/static/stopwords/sl b/static/stopwords/sl

new file mode 100644 (file)

index 0000000..7135ed3
--- /dev/null
+++ b/static/stopwords/sl
@@ -0,0 +1,446 @@
+a
+ali
+april
+avgust
+b
+bi
+bil
+bila
+bile
+bili
+bilo
+biti
+blizu
+bo
+bodo
+bojo
+bolj
+bom
+bomo
+boste
+bova
+boš
+brez
+c
+cel
+cela
+celi
+celo
+d
+da
+daleč
+dan
+danes
+datum
+december
+deset
+deseta
+deseti
+deseto
+devet
+deveta
+deveti
+deveto
+do
+dober
+dobra
+dobri
+dobro
+dokler
+dol
+dolg
+dolga
+dolgi
+dovolj
+drug
+druga
+drugi
+drugo
+dva
+dve
+e
+eden
+en
+ena
+ene
+eni
+enkrat
+eno
+etc.
+f
+februar
+g
+g.
+ga
+ga.
+gor
+gospa
+gospod
+h
+halo
+i
+idr.
+ii
+iii
+in
+iv
+ix
+iz
+j
+januar
+jaz
+je
+ji
+jih
+jim
+jo
+julij
+junij
+jutri
+k
+kadarkoli
+kaj
+kajti
+kako
+kakor
+kamor
+kamorkoli
+kar
+karkoli
+katerikoli
+kdaj
+kdo
+kdorkoli
+ker
+ki
+kje
+kjer
+kjerkoli
+ko
+koder
+koderkoli
+koga
+komu
+kot
+kratek
+kratka
+kratke
+kratki
+l
+lahka
+lahke
+lahki
+lahko
+le
+lep
+lepa
+lepe
+lepi
+lepo
+leto
+m
+maj
+majhen
+majhna
+majhni
+malce
+malo
+manj
+marec
+me
+med
+medtem
+mene
+mesec
+mi
+midva
+midve
+mnogo
+moj
+moja
+moje
+mora
+morajo
+moram
+moramo
+morate
+moraš
+morem
+mu
+n
+na
+nad
+naj
+najina
+najino
+najmanj
+naju
+največ
+nam
+narobe
+nas
+nato
+nazaj
+naš
+naša
+naše
+ne
+nedavno
+nedelja
+nek
+neka
+nekaj
+nekatere
+nekateri
+nekatero
+nekdo
+neke
+nekega
+neki
+nekje
+neko
+nekoga
+nekoč
+ni
+nikamor
+nikdar
+nikjer
+nikoli
+nič
+nje
+njega
+njegov
+njegova
+njegovo
+njej
+njemu
+njen
+njena
+njeno
+nji
+njih
+njihov
+njihova
+njihovo
+njiju
+njim
+njo
+njun
+njuna
+njuno
+no
+nocoj
+november
+npr.
+o
+ob
+oba
+obe
+oboje
+od
+odprt
+odprta
+odprti
+okoli
+oktober
+on
+onadva
+one
+oni
+onidve
+osem
+osma
+osmi
+osmo
+oz.
+p
+pa
+pet
+peta
+petek
+peti
+peto
+po
+pod
+pogosto
+poleg
+poln
+polna
+polni
+polno
+ponavadi
+ponedeljek
+ponovno
+potem
+povsod
+pozdravljen
+pozdravljeni
+prav
+prava
+prave
+pravi
+pravo
+prazen
+prazna
+prazno
+prbl.
+precej
+pred
+prej
+preko
+pri
+pribl.
+približno
+primer
+pripravljen
+pripravljena
+pripravljeni
+proti
+prva
+prvi
+prvo
+r
+ravno
+redko
+res
+reč
+s
+saj
+sam
+sama
+same
+sami
+samo
+se
+sebe
+sebi
+sedaj
+sedem
+sedma
+sedmi
+sedmo
+sem
+september
+seveda
+si
+sicer
+skoraj
+skozi
+slab
+smo
+so
+sobota
+spet
+sreda
+srednja
+srednji
+sta
+ste
+stran
+stvar
+sva
+t
+ta
+tak
+taka
+take
+taki
+tako
+takoj
+tam
+te
+tebe
+tebi
+tega
+težak
+težka
+težki
+težko
+ti
+tista
+tiste
+tisti
+tisto
+tj.
+tja
+to
+toda
+torek
+tretja
+tretje
+tretji
+tri
+tu
+tudi
+tukaj
+tvoj
+tvoja
+tvoje
+u
+v
+vaju
+vam
+vas
+vaš
+vaša
+vaše
+ve
+vedno
+velik
+velika
+veliki
+veliko
+vendar
+ves
+več
+vi
+vidva
+vii
+viii
+visok
+visoka
+visoke
+visoki
+vsa
+vsaj
+vsak
+vsaka
+vsakdo
+vsake
+vsaki
+vsakomur
+vse
+vsega
+vsi
+vso
+včasih
+včeraj
+x
+z
+za
+zadaj
+zadnji
+zakaj
+zaprta
+zaprti
+zaprto
+zdaj
+zelo
+zunaj
+č
+če
+često
+četrta
+četrtek
+četrti
+četrto
+čez
+čigav
+š
+šest
+šesta
+šesti
+šesto
+štiri
+ž
+že
+\ No newline at end of file
diff --git a/static/stopwords/so b/static/stopwords/so

new file mode 100644 (file)

index 0000000..4153023
--- /dev/null
+++ b/static/stopwords/so
@@ -0,0 +1,30 @@
+aad
+albaabkii
+atabo
+ay
+ayaa
+ayee
+ayuu
+dhan
+hadana
+in
+inuu
+isku
+jiray
+jirtay
+ka
+kale
+kasoo
+ku
+kuu
+lakin
+markii
+oo
+si
+soo
+uga
+ugu
+uu
+waa
+waxa
+waxuu
+\ No newline at end of file
diff --git a/static/stopwords/st b/static/stopwords/st

new file mode 100644 (file)

index 0000000..92bd21d
--- /dev/null
+++ b/static/stopwords/st
@@ -0,0 +1,31 @@
+a
+ba
+bane
+bona
+e
+ea
+eaba
+empa
+ena
+ha
+hae
+hape
+ho
+hore
+ka
+ke
+la
+le
+li
+me
+mo
+moo
+ne
+o
+oa
+re
+sa
+se
+tloha
+tsa
+tse
+\ No newline at end of file
diff --git a/static/stopwords/sv b/static/stopwords/sv

new file mode 100644 (file)

index 0000000..e768342
--- /dev/null
+++ b/static/stopwords/sv
@@ -0,0 +1,418 @@
+aderton
+adertonde
+adjö
+aldrig
+alla
+allas
+allt
+alltid
+alltså
+andra
+andras
+annan
+annat
+artonde
+artonn
+att
+av
+bakom
+bara
+behöva
+behövas
+behövde
+behövt
+beslut
+beslutat
+beslutit
+bland
+blev
+bli
+blir
+blivit
+bort
+borta
+bra
+bäst
+bättre
+båda
+bådas
+dag
+dagar
+dagarna
+dagen
+de
+del
+delen
+dem
+den
+denna
+deras
+dess
+dessa
+det
+detta
+dig
+din
+dina
+dit
+ditt
+dock
+dom
+du
+där
+därför
+då
+e
+efter
+eftersom
+ej
+elfte
+eller
+elva
+emot
+en
+enkel
+enkelt
+enkla
+enligt
+ens
+er
+era
+ers
+ert
+ett
+ettusen
+fanns
+fem
+femte
+femtio
+femtionde
+femton
+femtonde
+fick
+fin
+finnas
+finns
+fjorton
+fjortonde
+fjärde
+fler
+flera
+flesta
+fram
+framför
+från
+fyra
+fyrtio
+fyrtionde
+få
+får
+fått
+följande
+för
+före
+förlåt
+förra
+första
+genast
+genom
+gick
+gjorde
+gjort
+god
+goda
+godare
+godast
+gott
+gälla
+gäller
+gällt
+gärna
+gå
+går
+gått
+gör
+göra
+ha
+hade
+haft
+han
+hans
+har
+heller
+hellre
+helst
+helt
+henne
+hennes
+hit
+hon
+honom
+hundra
+hundraen
+hundraett
+hur
+här
+hög
+höger
+högre
+högst
+i
+ibland
+icke
+idag
+igen
+igår
+imorgon
+in
+inför
+inga
+ingen
+ingenting
+inget
+innan
+inne
+inom
+inte
+inuti
+ja
+jag
+jo
+ju
+just
+jämfört
+kan
+kanske
+knappast
+kom
+komma
+kommer
+kommit
+kr
+kunde
+kunna
+kunnat
+kvar
+legat
+ligga
+ligger
+lika
+likställd
+likställda
+lilla
+lite
+liten
+litet
+länge
+längre
+längst
+lätt
+lättare
+lättast
+långsam
+långsammare
+långsammast
+långsamt
+långt
+låt
+man
+med
+mej
+mellan
+men
+mer
+mera
+mest
+mig
+min
+mina
+mindre
+minst
+mitt
+mittemot
+mot
+mycket
+många
+måste
+möjlig
+möjligen
+möjligt
+möjligtvis
+ned
+nederst
+nedersta
+nedre
+nej
+ner
+ni
+nio
+nionde
+nittio
+nittionde
+nitton
+nittonde
+nog
+noll
+nr
+nu
+nummer
+när
+nästa
+någon
+någonting
+något
+några
+nån
+nånting
+nåt
+nödvändig
+nödvändiga
+nödvändigt
+nödvändigtvis
+och
+också
+ofta
+oftast
+olika
+olikt
+om
+oss
+på
+rakt
+redan
+rätt
+sa
+sade
+sagt
+samma
+sedan
+senare
+senast
+sent
+sex
+sextio
+sextionde
+sexton
+sextonde
+sig
+sin
+sina
+sist
+sista
+siste
+sitt
+sitta
+sju
+sjunde
+sjuttio
+sjuttionde
+sjutton
+sjuttonde
+själv
+sjätte
+ska
+skall
+skulle
+slutligen
+små
+smått
+snart
+som
+stor
+stora
+stort
+större
+störst
+säga
+säger
+sämre
+sämst
+så
+sådan
+sådana
+sådant
+ta
+tack
+tar
+tidig
+tidigare
+tidigast
+tidigt
+till
+tills
+tillsammans
+tio
+tionde
+tjugo
+tjugoen
+tjugoett
+tjugonde
+tjugotre
+tjugotvå
+tjungo
+tolfte
+tolv
+tre
+tredje
+trettio
+trettionde
+tretton
+trettonde
+två
+tvåhundra
+under
+upp
+ur
+ursäkt
+ut
+utan
+utanför
+ute
+va
+vad
+var
+vara
+varför
+varifrån
+varit
+varje
+varken
+vars
+varsågod
+vart
+vem
+vems
+verkligen
+vi
+vid
+vidare
+viktig
+viktigare
+viktigast
+viktigt
+vilka
+vilkas
+vilken
+vilket
+vill
+väl
+vänster
+vänstra
+värre
+vår
+våra
+vårt
+än
+ännu
+är
+även
+åt
+åtminstone
+åtta
+åttio
+åttionde
+åttonde
+över
+övermorgon
+överst
+övre
+\ No newline at end of file
diff --git a/static/stopwords/sw b/static/stopwords/sw

new file mode 100644 (file)

index 0000000..9d54c01
--- /dev/null
+++ b/static/stopwords/sw
@@ -0,0 +1,74 @@
+akasema
+alikuwa
+alisema
+baada
+basi
+bila
+cha
+chini
+hadi
+hapo
+hata
+hivyo
+hiyo
+huku
+huo
+ili
+ilikuwa
+juu
+kama
+karibu
+katika
+kila
+kima
+kisha
+kubwa
+kutoka
+kuwa
+kwa
+kwamba
+kwenda
+kwenye
+la
+lakini
+mara
+mdogo
+mimi
+mkubwa
+mmoja
+moja
+muda
+mwenye
+na
+naye
+ndani
+ng
+ni
+nini
+nonkungu
+pamoja
+pia
+sana
+sasa
+sauti
+tafadhali
+tena
+tu
+vile
+wa
+wakati
+wake
+walikuwa
+wao
+watu
+wengine
+wote
+ya
+yake
+yangu
+yao
+yeye
+yule
+za
+zaidi
+zake
+\ No newline at end of file
diff --git a/static/stopwords/th b/static/stopwords/th

new file mode 100644 (file)

index 0000000..ed52946
--- /dev/null
+++ b/static/stopwords/th
@@ -0,0 +1,115 @@
+กล่าว
+กว่า
+กัน
+กับ
+การ
+ก็
+ก่อน
+ขณะ
+ขอ
+ของ
+ขึ้น
+คง
+ครั้ง
+ความ
+คือ
+จะ
+จัด
+จาก
+จึง
+ช่วง
+ซึ่ง
+ดัง
+ด้วย
+ด้าน
+ตั้ง
+ตั้งแต่
+ตาม
+ต่อ
+ต่าง
+ต่างๆ
+ต้อง
+ถึง
+ถูก
+ถ้า
+ทั้ง
+ทั้งนี้
+ทาง
+ทำ
+ทำให้
+ที่
+ที่สุด
+ทุก
+นอกจาก
+นัก
+นั้น
+นำ
+นี้
+น่า
+บาง
+ผล
+ผ่าน
+พบ
+พร้อม
+มา
+มาก
+มี
+ยัง
+รวม
+ระหว่าง
+รับ
+ราย
+ร่วม
+ลง
+วัน
+ว่า
+สำหรับ
+สุด
+ส่ง
+ส่วน
+หนึ่ง
+หรือ
+หลัง
+หลังจาก
+หลาย
+หาก
+อยาก
+อยู่
+อย่าง
+ออก
+อะไร
+อาจ
+อีก
+เขา
+เข้า
+เคย
+เฉพาะ
+เช่น
+เดียว
+เดียวกัน
+เนื่องจาก
+เปิด
+เปิดเผย
+เป็น
+เป็นการ
+เพราะ
+เพื่อ
+เมื่อ
+เรา
+เริ่ม
+เลย
+เห็น
+เอง
+แต่
+แบบ
+แรก
+และ
+แล้ว
+แห่ง
+โดย
+ใน
+ให้
+ได้
+ไป
+ไม่
+ไว้
+\ No newline at end of file
diff --git a/static/stopwords/tl b/static/stopwords/tl

new file mode 100644 (file)

index 0000000..1fb0a91
--- /dev/null
+++ b/static/stopwords/tl
@@ -0,0 +1,147 @@
+akin
+aking
+ako
+alin
+am
+amin
+aming
+ang
+ano
+anumang
+apat
+at
+atin
+ating
+ay
+bababa
+bago
+bakit
+bawat
+bilang
+dahil
+dalawa
+dapat
+din
+dito
+doon
+gagawin
+gayunman
+ginagawa
+ginawa
+ginawang
+gumawa
+gusto
+habang
+hanggang
+hindi
+huwag
+iba
+ibaba
+ibabaw
+ibig
+ikaw
+ilagay
+ilalim
+ilan
+inyong
+isa
+isang
+itaas
+ito
+iyo
+iyon
+iyong
+ka
+kahit
+kailangan
+kailanman
+kami
+kanila
+kanilang
+kanino
+kanya
+kanyang
+kapag
+kapwa
+karamihan
+katiyakan
+katulad
+kaya
+kaysa
+ko
+kong
+kulang
+kumuha
+kung
+laban
+lahat
+lamang
+likod
+lima
+maaari
+maaaring
+maging
+mahusay
+makita
+marami
+marapat
+masyado
+may
+mayroon
+mga
+minsan
+mismo
+mula
+muli
+na
+nabanggit
+naging
+nagkaroon
+nais
+nakita
+namin
+napaka
+narito
+nasaan
+ng
+ngayon
+ni
+nila
+nilang
+nito
+niya
+niyang
+noon
+o
+pa
+paano
+pababa
+paggawa
+pagitan
+pagkakaroon
+pagkatapos
+palabas
+pamamagitan
+panahon
+pangalawa
+para
+paraan
+pareho
+pataas
+pero
+pumunta
+pumupunta
+sa
+saan
+sabi
+sabihin
+sarili
+sila
+sino
+siya
+tatlo
+tayo
+tulad
+tungkol
+una
+walang
+\ No newline at end of file
diff --git a/static/stopwords/tr b/static/stopwords/tr

new file mode 100644 (file)

index 0000000..9fb17f2
--- /dev/null
+++ b/static/stopwords/tr
@@ -0,0 +1,504 @@
+acaba
+acep
+adamakıllı
+adeta
+ait
+altmýþ
+altmış
+altý
+altı
+ama
+amma
+anca
+ancak
+arada
+artýk
+aslında
+aynen
+ayrıca
+az
+açıkça
+açıkçası
+bana
+bari
+bazen
+bazý
+bazı
+başkası
+baţka
+belki
+ben
+benden
+beni
+benim
+beri
+beriki
+beþ
+beş
+beţ
+bilcümle
+bile
+bin
+binaen
+binaenaleyh
+bir
+biraz
+birazdan
+birbiri
+birden
+birdenbire
+biri
+birice
+birileri
+birisi
+birkaç
+birkaçı
+birkez
+birlikte
+birçok
+birçoğu
+birþey
+birþeyi
+birşey
+birşeyi
+birţey
+bitevi
+biteviye
+bittabi
+biz
+bizatihi
+bizce
+bizcileyin
+bizden
+bize
+bizi
+bizim
+bizimki
+bizzat
+boşuna
+bu
+buna
+bunda
+bundan
+bunlar
+bunları
+bunların
+bunu
+bunun
+buracıkta
+burada
+buradan
+burası
+böyle
+böylece
+böylecene
+böylelikle
+böylemesine
+böylesine
+büsbütün
+bütün
+cuk
+cümlesi
+da
+daha
+dahi
+dahil
+dahilen
+daima
+dair
+dayanarak
+de
+defa
+dek
+demin
+demincek
+deminden
+denli
+derakap
+derhal
+derken
+deđil
+değil
+değin
+diye
+diđer
+diğer
+diğeri
+doksan
+dokuz
+dolayı
+dolayısıyla
+doğru
+dört
+edecek
+eden
+ederek
+edilecek
+ediliyor
+edilmesi
+ediyor
+elbet
+elbette
+elli
+emme
+en
+enikonu
+epey
+epeyce
+epeyi
+esasen
+esnasında
+etmesi
+etraflı
+etraflıca
+etti
+ettiği
+ettiğini
+evleviyetle
+evvel
+evvela
+evvelce
+evvelden
+evvelemirde
+evveli
+eđer
+eğer
+fakat
+filanca
+gah
+gayet
+gayetle
+gayri
+gayrı
+gelgelelim
+gene
+gerek
+gerçi
+geçende
+geçenlerde
+gibi
+gibilerden
+gibisinden
+gine
+göre
+gırla
+hakeza
+halbuki
+halen
+halihazırda
+haliyle
+handiyse
+hangi
+hangisi
+hani
+hariç
+hasebiyle
+hasılı
+hatta
+hele
+hem
+henüz
+hep
+hepsi
+her
+herhangi
+herkes
+herkesin
+hiç
+hiçbir
+hiçbiri
+hoş
+hulasaten
+iken
+iki
+ila
+ile
+ilen
+ilgili
+ilk
+illa
+illaki
+imdi
+indinde
+inen
+insermi
+ise
+ister
+itibaren
+itibariyle
+itibarıyla
+iyi
+iyice
+iyicene
+için
+iş
+işte
+iţte
+kadar
+kaffesi
+kah
+kala
+kanýmca
+karşın
+katrilyon
+kaynak
+kaçı
+kelli
+kendi
+kendilerine
+kendini
+kendisi
+kendisine
+kendisini
+kere
+kez
+keza
+kezalik
+keşke
+keţke
+ki
+kim
+kimden
+kime
+kimi
+kimisi
+kimse
+kimsecik
+kimsecikler
+külliyen
+kýrk
+kýsaca
+kırk
+kısaca
+lakin
+leh
+lütfen
+maada
+madem
+mademki
+mamafih
+mebni
+međer
+meğer
+meğerki
+meğerse
+milyar
+milyon
+mu
+mü
+mý
+mı
+nasýl
+nasıl
+nasılsa
+nazaran
+naşi
+ne
+neden
+nedeniyle
+nedenle
+nedense
+nerde
+nerden
+nerdeyse
+nere
+nerede
+nereden
+neredeyse
+neresi
+nereye
+netekim
+neye
+neyi
+neyse
+nice
+nihayet
+nihayetinde
+nitekim
+niye
+niçin
+o
+olan
+olarak
+oldu
+olduklarını
+oldukça
+olduğu
+olduğunu
+olmadı
+olmadığı
+olmak
+olması
+olmayan
+olmaz
+olsa
+olsun
+olup
+olur
+olursa
+oluyor
+on
+ona
+onca
+onculayın
+onda
+ondan
+onlar
+onlardan
+onlari
+onlarýn
+onları
+onların
+onu
+onun
+oracık
+oracıkta
+orada
+oradan
+oranca
+oranla
+oraya
+otuz
+oysa
+oysaki
+pek
+pekala
+peki
+pekçe
+peyderpey
+rağmen
+sadece
+sahi
+sahiden
+sana
+sanki
+sekiz
+seksen
+sen
+senden
+seni
+senin
+siz
+sizden
+sizi
+sizin
+sonra
+sonradan
+sonraları
+sonunda
+tabii
+tam
+tamam
+tamamen
+tamamıyla
+tarafından
+tek
+trilyon
+tüm
+var
+vardı
+vasıtasıyla
+ve
+velev
+velhasıl
+velhasılıkelam
+veya
+veyahut
+ya
+yahut
+yakinen
+yakında
+yakından
+yakınlarda
+yalnız
+yalnızca
+yani
+yapacak
+yapmak
+yaptı
+yaptıkları
+yaptığı
+yaptığını
+yapılan
+yapılması
+yapıyor
+yedi
+yeniden
+yenilerde
+yerine
+yetmiþ
+yetmiş
+yetmiţ
+yine
+yirmi
+yok
+yoksa
+yoluyla
+yüz
+yüzünden
+zarfında
+zaten
+zati
+zira
+çabuk
+çabukça
+çeşitli
+çok
+çokları
+çoklarınca
+çokluk
+çoklukla
+çokça
+çoğu
+çoğun
+çoğunca
+çoğunlukla
+çünkü
+öbür
+öbürkü
+öbürü
+önce
+önceden
+önceleri
+öncelikle
+öteki
+ötekisi
+öyle
+öylece
+öylelikle
+öylemesine
+öz
+üzere
+üç
+þey
+þeyden
+þeyi
+þeyler
+þu
+þuna
+þunda
+þundan
+þunu
+şayet
+şey
+şeyden
+şeyi
+şeyler
+şu
+şuna
+şuncacık
+şunda
+şundan
+şunlar
+şunları
+şunu
+şunun
+şura
+şuracık
+şuracıkta
+şurası
+şöyle
+ţayet
+ţimdi
+ţu
+ţöyle
+\ No newline at end of file
diff --git a/static/stopwords/uk b/static/stopwords/uk

new file mode 100644 (file)

index 0000000..02cb25f
--- /dev/null
+++ b/static/stopwords/uk
@@ -0,0 +1,28 @@
+але
+ви
+вона
+вони
+воно
+він
+в╡д
+з
+й
+коли
+ми
+нам
+про
+та
+ти
+хоча
+це
+цей
+чи
+чого
+що
+як
+яко╞
+із
+інших
+╙
+╞х
+╡
+\ No newline at end of file
diff --git a/static/stopwords/ur b/static/stopwords/ur

new file mode 100644 (file)

index 0000000..3b9eef5
--- /dev/null
+++ b/static/stopwords/ur
@@ -0,0 +1,517 @@
+آئی
+آئے
+آج
+آخر
+آخرکبر
+آدهی
+آًب
+آٹھ
+آیب
+اة
+اخبزت
+اختتبم
+ادھر
+ارد
+اردگرد
+ارکبى
+اش
+اضتعوبل
+اضتعوبلات
+اضطرذ
+اضکب
+اضکی
+اضکے
+اطراف
+اغیب
+افراد
+الگ
+اور
+اوًچب
+اوًچبئی
+اوًچی
+اوًچے
+اى
+اً
+اًذر
+اًہیں
+اٹھبًب
+اپٌب
+اپٌے
+اچھب
+اچھی
+اچھے
+اکثر
+اکٹھب
+اکٹھی
+اکٹھے
+اکیلا
+اکیلی
+اکیلے
+اگرچہ
+اہن
+ایطے
+ایک
+ب
+ت
+تبزٍ
+تت
+تر
+ترتیت
+تریي
+تعذاد
+تن
+تو
+توبم
+توہی
+توہیں
+تٌہب
+تک
+تھب
+تھوڑا
+تھوڑی
+تھوڑے
+تھی
+تھے
+تیي
+ثب
+ثبئیں
+ثبترتیت
+ثبری
+ثبرے
+ثبعث
+ثبلا
+ثبلترتیت
+ثبہر
+ثدبئے
+ثرآں
+ثراں
+ثرش
+ثعذ
+ثغیر
+ثلٌذ
+ثلٌذوثبلا
+ثلکہ
+ثي
+ثٌب
+ثٌبرہب
+ثٌبرہی
+ثٌبرہے
+ثٌبًب
+ثٌذ
+ثٌذکرو
+ثٌذکرًب
+ثٌذی
+ثڑا
+ثڑوں
+ثڑی
+ثڑے
+ثھر
+ثھرا
+ثھراہوا
+ثھرپور
+ثھی
+ثہت
+ثہتر
+ثہتری
+ثہتریي
+ثیچ
+ج
+خب
+خبرہب
+خبرہی
+خبرہے
+خبهوظ
+خبًب
+خبًتب
+خبًتی
+خبًتے
+خبًٌب
+خت
+ختن
+خجکہ
+خص
+خططرذ
+خلذی
+خو
+خواى
+خوًہی
+خوکہ
+خٌبة
+خگہ
+خگہوں
+خگہیں
+خیطب
+خیطبکہ
+در
+درخبت
+درخہ
+درخے
+درزقیقت
+درضت
+دش
+دفعہ
+دلچطپ
+دلچطپی
+دلچطپیبں
+دو
+دور
+دوراى
+دوضرا
+دوضروں
+دوضری
+دوضرے
+دوًوں
+دکھبئیں
+دکھبتب
+دکھبتی
+دکھبتے
+دکھبو
+دکھبًب
+دکھبیب
+دی
+دیب
+دیتب
+دیتی
+دیتے
+دیر
+دیٌب
+دیکھو
+دیکھٌب
+دیکھی
+دیکھیں
+دے
+ر
+راضتوں
+راضتہ
+راضتے
+رریعہ
+رریعے
+رکي
+رکھ
+رکھب
+رکھتب
+رکھتبہوں
+رکھتی
+رکھتے
+رکھی
+رکھے
+رہب
+رہی
+رہے
+ز
+زبصل
+زبضر
+زبل
+زبلات
+زبلیہ
+زصوں
+زصہ
+زصے
+زقبئق
+زقیتیں
+زقیقت
+زکن
+زکویہ
+زیبدٍ
+صبف
+صسیر
+صفر
+صورت
+صورتسبل
+صورتوں
+صورتیں
+ض
+ضبت
+ضبتھ
+ضبدٍ
+ضبرا
+ضبرے
+ضبل
+ضبلوں
+ضت
+ضرور
+ضرورت
+ضروری
+ضلطلہ
+ضوچ
+ضوچب
+ضوچتب
+ضوچتی
+ضوچتے
+ضوچو
+ضوچٌب
+ضوچی
+ضوچیں
+ضکب
+ضکتب
+ضکتی
+ضکتے
+ضکٌب
+ضکی
+ضکے
+ضیذھب
+ضیذھی
+ضیذھے
+ضیکٌڈ
+ضے
+طرف
+طریق
+طریقوں
+طریقہ
+طریقے
+طور
+طورپر
+ظبہر
+ع
+عذد
+عظین
+علاقوں
+علاقہ
+علاقے
+علاوٍ
+عووهی
+غبیذ
+غخص
+غذ
+غروع
+غروعبت
+غے
+فرد
+فی
+ق
+قجل
+قجیلہ
+قطن
+لئے
+لا
+لازهی
+لو
+لوجب
+لوجی
+لوجے
+لوسبت
+لوسہ
+لوگ
+لوگوں
+لڑکپي
+لگتب
+لگتی
+لگتے
+لگٌب
+لگی
+لگیں
+لگے
+لی
+لیب
+لیٌب
+لیں
+لے
+ه
+هتعلق
+هختلف
+هسترم
+هسترهہ
+هسطوش
+هسیذ
+هطئلہ
+هطئلے
+هطبئل
+هطتعول
+هطلق
+هعلوم
+هػتول
+هلا
+هوکي
+هوکٌبت
+هوکٌہ
+هٌبضت
+هڑا
+هڑًب
+هڑے
+هکول
+هگر
+هہرثبى
+هیرا
+هیری
+هیرے
+هیں
+و
+وار
+والے
+وٍ
+ًئی
+ًئے
+ًب
+ًبپطٌذ
+ًبگسیر
+ًطجت
+ًقطہ
+ًو
+ًوخواى
+ًکبلٌب
+ًکتہ
+ًہ
+ًہیں
+ًیب
+ًے
+ٓ آش
+ٹھیک
+پبئے
+پبش
+پبًب
+پبًچ
+پر
+پراًب
+پطٌذ
+پل
+پورا
+پوچھب
+پوچھتب
+پوچھتی
+پوچھتے
+پوچھو
+پوچھوں
+پوچھٌب
+پوچھیں
+پچھلا
+پھر
+پہلا
+پہلی
+پہلےضی
+پہلےضے
+پہلےضےہی
+پیع
+چبر
+چبہب
+چبہٌب
+چبہے
+چلا
+چلو
+چلیں
+چلے
+چکب
+چکی
+چکیں
+چکے
+چھوٹب
+چھوٹوں
+چھوٹی
+چھوٹے
+چھہ
+چیسیں
+ڈھوًڈا
+ڈھوًڈلیب
+ڈھوًڈو
+ڈھوًڈًب
+ڈھوًڈی
+ڈھوًڈیں
+ک
+کئی
+کئے
+کب
+کبفی
+کبم
+کت
+کجھی
+کرا
+کرتب
+کرتبہوں
+کرتی
+کرتے
+کرتےہو
+کررہب
+کررہی
+کررہے
+کرو
+کرًب
+کریں
+کرے
+کطی
+کل
+کن
+کوئی
+کوتر
+کورا
+کوروں
+کورٍ
+کورے
+کوطي
+کوى
+کوًطب
+کوًطی
+کوًطے
+کھولا
+کھولو
+کھولٌب
+کھولی
+کھولیں
+کھولے
+کہ
+کہب
+کہتب
+کہتی
+کہتے
+کہو
+کہوں
+کہٌب
+کہی
+کہیں
+کہے
+کی
+کیب
+کیطب
+کیطرف
+کیطے
+کیلئے
+کیوًکہ
+کیوں
+کیے
+کے
+کےثعذ
+کےرریعے
+گئی
+گئے
+گب
+گرد
+گروٍ
+گروپ
+گروہوں
+گٌتی
+گی
+گیب
+گے
+ہر
+ہن
+ہو
+ہوئی
+ہوئے
+ہوا
+ہوبرا
+ہوبری
+ہوبرے
+ہوتب
+ہوتی
+ہوتے
+ہورہب
+ہورہی
+ہورہے
+ہوضکتب
+ہوضکتی
+ہوضکتے
+ہوًب
+ہوًی
+ہوًے
+ہوچکب
+ہوچکی
+ہوچکے
+ہوگئی
+ہوگئے
+ہوگیب
+ہوں
+ہی
+ہیں
+ہے
+ی
+یقیٌی
+یہ
+یہبں
+\ No newline at end of file
diff --git a/static/stopwords/vi b/static/stopwords/vi

new file mode 100644 (file)

index 0000000..f480ff8
--- /dev/null
+++ b/static/stopwords/vi
@@ -0,0 +1,645 @@
+a ha
+a-lô
+ai
+ai ai
+ai nấy
+alô
+amen
+anh
+bao giờ
+bao lâu
+bao nhiêu
+bao nả
+bay biến
+biết
+biết bao
+biết bao nhiêu
+biết chừng nào
+biết mấy
+biết đâu
+biết đâu chừng
+biết đâu đấy
+bà
+bài
+bác
+bây bẩy
+bây chừ
+bây giờ
+bây nhiêu
+bèn
+béng
+bông
+bạn
+bản
+bất chợt
+bất cứ
+bất giác
+bất kì
+bất kể
+bất kỳ
+bất luận
+bất nhược
+bất quá
+bất thình lình
+bất tử
+bất đồ
+bấy
+bấy chầy
+bấy chừ
+bấy giờ
+bấy lâu
+bấy lâu nay
+bấy nay
+bấy nhiêu
+bập bà bập bõm
+bập bõm
+bắt đầu từ
+bằng
+bằng không
+bằng nấy
+bằng ấy
+bển
+bệt
+bị
+bỏ mẹ
+bỗng
+bỗng chốc
+bỗng dưng
+bỗng không
+bỗng nhiên
+bỗng đâu
+bộ
+bội phần
+bớ
+bởi
+bởi chưng
+bởi nhưng
+bởi thế
+bởi vì
+bởi vậy
+bức
+cao
+cha
+cha chả
+chao ôi
+chiếc
+cho
+cho nên
+cho tới
+cho tới khi
+cho đến
+cho đến khi
+choa
+chu cha
+chui cha
+chung cục
+chung qui
+chung quy
+chung quy lại
+chuyện
+chành chạnh
+chí chết
+chính
+chính là
+chính thị
+chùn chùn
+chùn chũn
+chú
+chú mày
+chú mình
+chúng mình
+chúng ta
+chúng tôi
+chăn chắn
+chăng
+chưa
+chầm chập
+chậc
+chắc
+chắc hẳn
+chẳng lẽ
+chẳng những
+chẳng nữa
+chẳng phải
+chết nỗi
+chết thật
+chết tiệt
+chỉ
+chỉn
+chốc chốc
+chớ
+chớ chi
+chợt
+chủn
+chứ
+chứ lị
+coi bộ
+coi mòi
+con
+cu cậu
+cuốn
+cuộc
+càng
+các
+cái
+cây
+còn
+có
+có chăng là
+có dễ
+có thể
+có vẻ
+cóc khô
+cô
+cô mình
+công nhiên
+cùng
+cùng cực
+cùng nhau
+cùng với
+căn
+căn cắt
+cũng
+cũng như
+cũng vậy
+cũng vậy thôi
+cơ
+cơ chừng
+cơ hồ
+cơ mà
+cơn
+cả
+cả thảy
+cả thể
+cảm ơn
+cần
+cật lực
+cật sức
+cậu
+cổ lai
+của
+cứ
+cứ việc
+cực lực
+do
+do vì
+do vậy
+do đó
+duy
+dào
+dì
+dù cho
+dù rằng
+dưới
+dạ
+dần dà
+dần dần
+dầu sao
+dẫu
+dẫu sao
+dễ sợ
+dễ thường
+dở chừng
+dữ
+em
+giữa
+gì
+hay
+hoàn toàn
+hoặc
+hơn
+hầu hết
+họ
+hỏi
+khi
+khác
+không
+luôn
+là
+làm
+lên
+lúc
+lại
+lần
+lớn
+muốn
+mà
+mình
+mỗi
+một
+một cách
+mới
+mợ
+ngay
+ngay cả
+ngay khi
+ngay lúc
+ngay lập tức
+ngay tức khắc
+ngay từ
+nghe chừng
+nghe đâu
+nghen
+nghiễm nhiên
+nghỉm
+ngoài
+ngoài ra
+ngoải
+ngày
+ngày càng
+ngày ngày
+ngày xưa
+ngày xửa
+ngôi
+ngõ hầu
+ngăn ngắt
+ngươi
+người
+ngọn
+ngọt
+ngộ nhỡ
+nh
+nhau
+nhiên hậu
+nhiều
+nhiệt liệt
+nhung nhăng
+nhà
+nhân dịp
+nhân tiện
+nhé
+nhón nhén
+như
+như chơi
+như không
+như quả
+như thể
+như tuồng
+như vậy
+nhưng
+nhưng mà
+nhược bằng
+nhất
+nhất loạt
+nhất luật
+nhất mực
+nhất nhất
+nhất quyết
+nhất sinh
+nhất thiết
+nhất tâm
+nhất tề
+nhất đán
+nhất định
+nhận
+nhỉ
+nhỡ ra
+những
+những ai
+những như
+nào
+này
+nên
+nên chi
+nó
+nóc
+nói
+năm
+nơi
+nấy
+nếu
+nếu như
+nền
+nọ
+nớ
+nức nở
+nữa
+oai oái
+oái
+pho
+phè
+phóc
+phót
+phăn phắt
+phương chi
+phải
+phải chi
+phải chăng
+phắt
+phỉ phui
+phỏng
+phỏng như
+phốc
+phụt
+phứt
+qua
+qua quít
+qua quýt
+quyết
+quyết nhiên
+quyển
+quá
+quá chừng
+quá lắm
+quá sá
+quá thể
+quá trời
+quá xá
+quá đỗi
+quá độ
+quá ư
+quý hồ
+quả
+quả là
+quả tang
+quả thật
+quả tình
+quả vậy
+quả đúng
+ra
+ra phết
+ra sao
+ra trò
+ren rén
+riu ríu
+riêng
+riệt
+rày
+ráo
+ráo trọi
+rén
+rích
+rón rén
+rút cục
+răng
+rất
+rằng
+rằng là
+rốt cuộc
+rốt cục
+rồi
+rứa
+sa sả
+sao
+sau
+sau chót
+sau cuối
+sau cùng
+sau đó
+so
+song le
+suýt
+sì
+sạch
+sất
+sắp
+sẽ
+số
+số là
+sốt sột
+sở dĩ
+sự
+tanh
+tha hồ
+than ôi
+thanh
+theo
+thi thoảng
+thoạt
+thoạt nhiên
+thoắt
+thuần
+thà
+thà là
+thà rằng
+thành ra
+thành thử
+thái quá
+tháng
+thì
+thì thôi
+thình lình
+thím
+thôi
+thúng thắng
+thương ôi
+thường
+thảo hèn
+thảo nào
+thấy
+thẩy
+thậm
+thậm chí
+thật lực
+thật ra
+thật vậy
+thế
+thế là
+thế mà
+thế nào
+thế nên
+thế ra
+thế thì
+thế à
+thếch
+thỉnh thoảng
+thỏm
+thốc
+thốc tháo
+thốt
+thốt nhiên
+thộc
+thời gian
+thục mạng
+thửa
+thực ra
+thực sự
+thực vậy
+tiếp theo
+tiếp đó
+tiện thể
+toà
+toé khói
+toẹt
+trong
+trên
+trước
+trước kia
+trước nay
+trước tiên
+trước đây
+trước đó
+trếu tráo
+trển
+trệt
+trệu trạo
+trỏng
+trời đất ơi
+trừ phi
+tuy
+tuy nhiên
+tuy rằng
+tuy thế
+tuy vậy
+tuyệt nhiên
+tuần tự
+tuốt luốt
+tuốt tuồn tuột
+tuốt tuột
+tà tà
+tênh
+tít mù
+tò te
+tôi
+tông tốc
+tù tì
+tăm tắp
+tại
+tại vì
+tấm
+tấn
+tất cả
+tất thảy
+tất tần tật
+tất tật
+tắp
+tắp lự
+tọt
+tỏ ra
+tỏ vẻ
+tốc tả
+tối ư
+tột
+tớ
+tới
+tức thì
+tức tốc
+từ
+từng
+tự vì
+tựu trung
+veo
+veo veo
+việc
+vung thiên địa
+vung tàn tán
+vung tán tàn
+và
+vào
+vâng
+vèo
+vì
+vì chưng
+vì thế
+vì vậy
+ví bằng
+ví dù
+ví phỏng
+ví thử
+vô hình trung
+vô kể
+vô luận
+vô vàn
+văng tê
+vạn nhất
+vả chăng
+vả lại
+vẫn
+vậy
+vậy là
+vậy thì
+về
+vị tất
+vốn dĩ
+với
+với lại
+vở
+vụt
+vừa
+vừa mới
+xa xả
+xiết bao
+xon xón
+xoành xoạch
+xoét
+xoẳn
+xoẹt
+xuất kì bất ý
+xuất kỳ bất ý
+xuể
+xuống
+xăm xúi
+xăm xăm
+xăm xắm
+xềnh xệch
+xệp
+à
+à ơi
+ào
+á
+á à
+ái
+ái chà
+ái dà
+áng
+âu là
+ô hay
+ô hô
+ô kê
+ô kìa
+ôi chao
+ôi thôi
+ông
+úi
+úi chà
+úi dào
+ý
+ý chừng
+ý da
+đang
+đi
+điều
+đành đạch
+đáng lí
+đáng lý
+đáng lẽ
+đánh đùng
+đáo để
+đây
+đã
+đó
+được
+đại loại
+đại nhân
+đại phàm
+đại để
+đến
+đến nỗi
+đều
+để
+ơ
+ơ hay
+ơ kìa
+ơi
+ư
+ạ
+ạ ơi
+ấy
+ầu ơ
+ắt
+ắt hẳn
+ắt là
+ối dào
+ối giời
+ối giời ơi
+ồ
+ổng
+ớ
+ờ
+ở
+ở trên
+ủa
+ứ hự
+ứ ừ
+ừ
+ử
+\ No newline at end of file
diff --git a/static/stopwords/yo b/static/stopwords/yo

new file mode 100644 (file)

index 0000000..60572e9
--- /dev/null
+++ b/static/stopwords/yo
@@ -0,0 +1,60 @@
+a
+an
+bá
+bí
+bẹ̀rẹ̀
+fún
+fẹ́
+gbogbo
+inú
+jù
+jẹ
+jẹ́
+kan
+kì
+kí
+kò
+láti
+lè
+lọ
+mi
+mo
+máa
+mọ̀
+ni
+náà
+ní
+nígbà
+nítorí
+nǹkan
+o
+padà
+pé
+púpọ̀
+pẹ̀lú
+rẹ̀
+sì
+sí
+sínú
+ṣ
+ti
+tí
+wà
+wá
+wọn
+wọ́n
+yìí
+àti
+àwọn
+é
+í
+òun
+ó
+ń
+ńlá
+ṣe
+ṣé
+ṣùgbọ́n
+ẹmọ́
+ọjọ́
+ọ̀pọ̀lọpọ̀
+\ No newline at end of file
diff --git a/static/stopwords/zh b/static/stopwords/zh

new file mode 100644 (file)

index 0000000..4ba6f1a
--- /dev/null
+++ b/static/stopwords/zh
@@ -0,0 +1,788 @@
+、
+。
+〈
+〉
+《
+》
+一
+一些
+一何
+一切
+一则
+一方面
+一旦
+一来
+一样
+一般
+一转眼
+七
+万一
+三
+上
+上下
+下
+不
+不仅
+不但
+不光
+不单
+不只
+不外乎
+不如
+不妨
+不尽
+不尽然
+不得
+不怕
+不惟
+不成
+不拘
+不料
+不是
+不比
+不然
+不特
+不独
+不管
+不至于
+不若
+不论
+不过
+不问
+与
+与其
+与其说
+与否
+与此同时
+且
+且不说
+且说
+两者
+个
+个别
+中
+临
+为
+为了
+为什么
+为何
+为止
+为此
+为着
+乃
+乃至
+乃至于
+么
+之
+之一
+之所以
+之类
+乌乎
+乎
+乘
+九
+也
+也好
+也罢
+了
+二
+二来
+于
+于是
+于是乎
+云云
+云尔
+五
+些
+亦
+人
+人们
+人家
+什
+什么
+什么样
+今
+介于
+仍
+仍旧
+从
+从此
+从而
+他
+他人
+他们
+他们们
+以
+以上
+以为
+以便
+以免
+以及
+以故
+以期
+以来
+以至
+以至于
+以致
+们
+任
+任何
+任凭
+会
+似的
+但
+但凡
+但是
+何
+何以
+何况
+何处
+何时
+余外
+作为
+你
+你们
+使
+使得
+例如
+依
+依据
+依照
+便于
+俺
+俺们
+倘
+倘使
+倘或
+倘然
+倘若
+借
+借傥然
+假使
+假如
+假若
+做
+像
+儿
+先不先
+光是
+全体
+全部
+八
+六
+兮
+共
+关于
+关于具体地说
+其
+其一
+其中
+其二
+其他
+其余
+其它
+其次
+具体地说
+具体说来
+兼之
+内
+再
+再其次
+再则
+再有
+再者
+再者说
+再说
+冒
+冲
+况且
+几
+几时
+凡
+凡是
+凭
+凭借
+出于
+出来
+分
+分别
+则
+则甚
+别
+别人
+别处
+别是
+别的
+别管
+别说
+到
+前后
+前此
+前者
+加之
+加以
+即
+即令
+即使
+即便
+即如
+即或
+即若
+却
+去
+又
+又及
+及
+及其
+及至
+反之
+反而
+反过来
+反过来说
+受到
+另
+另一方面
+另外
+另悉
+只
+只当
+只怕
+只是
+只有
+只消
+只要
+只限
+叫
+叮咚
+可
+可以
+可是
+可见
+各
+各个
+各位
+各种
+各自
+同
+同时
+后
+后者
+向
+向使
+向着
+吓
+吗
+否则
+吧
+吧哒
+含
+吱
+呀
+呃
+呕
+呗
+呜
+呜呼
+呢
+呵
+呵呵
+呸
+呼哧
+咋
+和
+咚
+咦
+咧
+咱
+咱们
+咳
+哇
+哈
+哈哈
+哉
+哎
+哎呀
+哎哟
+哗
+哟
+哦
+哩
+哪
+哪个
+哪些
+哪儿
+哪天
+哪年
+哪怕
+哪样
+哪边
+哪里
+哼
+哼唷
+唉
+唯有
+啊
+啐
+啥
+啦
+啪达
+啷当
+喂
+喏
+喔唷
+喽
+嗡
+嗡嗡
+嗬
+嗯
+嗳
+嘎
+嘎登
+嘘
+嘛
+嘻
+嘿
+嘿嘿
+四
+因
+因为
+因了
+因此
+因着
+因而
+固然
+在
+在下
+在于
+地
+基于
+处在
+多
+多么
+多少
+大
+大家
+她
+她们
+好
+如
+如上
+如上所述
+如下
+如何
+如其
+如同
+如是
+如果
+如此
+如若
+始而
+孰料
+孰知
+宁
+宁可
+宁愿
+宁肯
+它
+它们
+对
+对于
+对待
+对方
+对比
+将
+小
+尔
+尔后
+尔尔
+尚且
+就
+就是
+就是了
+就是说
+就算
+就要
+尽
+尽管
+尽管如此
+岂但
+己
+已
+已矣
+巴
+巴巴
+年
+并
+并且
+庶乎
+庶几
+开外
+开始
+归
+归齐
+当
+当地
+当然
+当着
+彼
+彼时
+彼此
+往
+待
+很
+得
+得了
+怎
+怎么
+怎么办
+怎么样
+怎奈
+怎样
+总之
+总的来看
+总的来说
+总的说来
+总而言之
+恰恰相反
+您
+惟其
+慢说
+我
+我们
+或
+或则
+或是
+或曰
+或者
+截至
+所
+所以
+所在
+所幸
+所有
+才
+才能
+打
+打从
+把
+抑或
+拿
+按
+按照
+换句话说
+换言之
+据
+据此
+接着
+故
+故此
+故而
+旁人
+无
+无宁
+无论
+既
+既往
+既是
+既然
+日
+时
+时候
+是
+是以
+是的
+更
+曾
+替
+替代
+最
+月
+有
+有些
+有关
+有及
+有时
+有的
+望
+朝
+朝着
+本
+本人
+本地
+本着
+本身
+来
+来着
+来自
+来说
+极了
+果然
+果真
+某
+某个
+某些
+某某
+根据
+欤
+正值
+正如
+正巧
+正是
+此
+此地
+此处
+此外
+此时
+此次
+此间
+毋宁
+每
+每当
+比
+比及
+比如
+比方
+没奈何
+沿
+沿着
+漫说
+焉
+然则
+然后
+然而
+照
+照着
+犹且
+犹自
+甚且
+甚么
+甚或
+甚而
+甚至
+甚至于
+用
+用来
+由
+由于
+由是
+由此
+由此可见
+的
+的确
+的话
+直到
+相对而言
+省得
+看
+眨眼
+着
+着呢
+矣
+矣乎
+矣哉
+离
+秒
+竟而
+第
+等
+等到
+等等
+简言之
+管
+类如
+紧接着
+纵
+纵令
+纵使
+纵然
+经
+经过
+结果
+给
+继之
+继后
+继而
+综上所述
+罢了
+者
+而
+而且
+而况
+而后
+而外
+而已
+而是
+而言
+能
+能否
+腾
+自
+自个儿
+自从
+自各儿
+自后
+自家
+自己
+自打
+自身
+至
+至于
+至今
+至若
+致
+般的
+若
+若夫
+若是
+若果
+若非
+莫不然
+莫如
+莫若
+虽
+虽则
+虽然
+虽说
+被
+要
+要不
+要不是
+要不然
+要么
+要是
+譬喻
+譬如
+让
+许多
+论
+设使
+设或
+设若
+诚如
+诚然
+该
+说
+说来
+请
+诸
+诸位
+诸如
+谁
+谁人
+谁料
+谁知
+贼死
+赖以
+赶
+起
+起见
+趁
+趁着
+越是
+距
+跟
+较
+较之
+边
+过
+还
+还是
+还有
+还要
+这
+这一来
+这个
+这么
+这么些
+这么样
+这么点儿
+这些
+这会儿
+这儿
+这就是说
+这时
+这样
+这次
+这般
+这边
+这里
+进而
+连
+连同
+逐步
+通过
+遵循
+遵照
+那
+那个
+那么
+那么些
+那么样
+那些
+那会儿
+那儿
+那时
+那样
+那般
+那边
+那里
+都
+鄙人
+鉴于
+针对
+阿
+除
+除了
+除外
+除开
+除此之外
+除非
+随
+随后
+随时
+随着
+难道说
+零
+非
+非但
+非徒
+非特
+非独
+靠
+顺
+顺着
+首先
+︿
+！
+＃
+＄
+％
+＆
+（
+）
+＊
+＋
+，
+０
+１
+２
+３
+４
+５
+６
+７
+８
+９
+：
+；
+＜
+＞
+？
+＠
+［
+］
+｛
+｜
+｝
+～
+￥
+\ No newline at end of file
diff --git a/static/stopwords/zu b/static/stopwords/zu

new file mode 100644 (file)

index 0000000..36c570c
--- /dev/null
+++ b/static/stopwords/zu
@@ -0,0 +1,29 @@
+futhi
+kahle
+kakhulu
+kanye
+khona
+kodwa
+kungani
+kusho
+la
+lakhe
+lapho
+mina
+ngesikhathi
+nje
+phansi
+phezulu
+u
+ukuba
+ukuthi
+ukuze
+uma
+wahamba
+wakhe
+wami
+wase
+wathi
+yakhe
+zakhe
+zonke
+\ No newline at end of file
diff --git a/subprojects/gtest.wrap b/subprojects/gtest.wrap

new file mode 100644 (file)

index 0000000..ba9c9b9
--- /dev/null
+++ b/subprojects/gtest.wrap
@@ -0,0 +1,10 @@
+[wrap-file]
+directory = googletest-release-1.8.1
+
+source_url = https://github.com/google/googletest/archive/release-1.8.1.zip
+source_filename = gtest-1.8.1.zip
+source_hash = 927827c183d01734cc5cfef85e0ff3f5a92ffe6188e0d18e909c5efebf28a0c7
+
+patch_url = https://wrapdb.mesonbuild.com/v1/projects/gtest/1.8.1/1/get_zip
+patch_filename = gtest-1.8.1-1-wrap.zip
+patch_hash = f79f5fd46e09507b3f2e09a51ea6eb20020effe543335f5aee59f30cc8d15805
diff --git a/test/cluster.cpp b/test/cluster.cpp

new file mode 100644 (file)

index 0000000..1a850e7
--- /dev/null
+++ b/test/cluster.cpp
@@ -0,0 +1,423 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <algorithm>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#if defined(_MSC_VER)
+# include <BaseTsd.h>
+  typedef SSIZE_T ssize_t;
+#else
+# include <unistd.h>
+#endif
+
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#undef min
+#undef max
+#endif
+
+#include "gtest/gtest.h"
+
+#include <zim/zim.h>
+
+#include "../src/buffer.h"
+#include "../src/cluster.h"
+#include "../src/file_part.h"
+#include "../src/file_compound.h"
+#include "../src/file_reader.h"
+#include "../src/writer/cluster.h"
+#include "../src/endian_tools.h"
+#include "../src/config.h"
+
+namespace
+{
+
+std::shared_ptr<zim::Buffer> write_to_buffer(zim::writer::Cluster& cluster)
+{
+#ifdef _WIN32
+  wchar_t cbase[MAX_PATH];
+  wchar_t ctmp[MAX_PATH];
+  GetTempPathW(MAX_PATH-14, cbase);
+  // This create a file for us, ensure it is unique.
+  // So we need to delete it and create the directory using the same name.
+  GetTempFileNameW(cbase, L"test_cluster", 0, ctmp);
+  auto tmp_fd = _wopen(ctmp, _O_CREAT | _O_TEMPORARY | _O_SHORT_LIVED | _O_RDWR | _O_TRUNC);
+#else
+  char tmpl[] = "/tmp/test_cluster_XXXXXX";
+  auto tmp_fd = mkstemp(tmpl);
+#endif
+  cluster.close();
+  cluster.write(tmp_fd);
+  auto size = lseek(tmp_fd, 0, SEEK_END);
+
+  char* content = new char[size];
+  lseek(tmp_fd, 0, SEEK_SET);
+  if (read(tmp_fd, content, size) == -1)
+    throw std::runtime_error("Cannot read");
+  close(tmp_fd);
+#ifndef _WIN32
+  unlink(tmpl);
+#endif
+  return std::shared_ptr<zim::Buffer>(
+      new zim::MemoryBuffer<true>(content, zim::zsize_t(size)));
+}
+
+// A new writer cluster starts empty; each addData() call appends one blob
+// whose recorded size equals the size of the payload handed in.
+TEST(ClusterTest, create_cluster)
+{
+  std::string payloads[] = {
+    "123456789012345678901234567890",
+    "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
+    "abcdefghijklmnopqrstuvwxyz"
+  };
+  const unsigned payload_count = 3;
+
+  zim::writer::Cluster cluster(zim::zimcompNone);
+  ASSERT_EQ(cluster.count().v, 0U);
+
+  for (const auto& payload : payloads) {
+    cluster.addData(payload.data(), zim::zsize_t(payload.size()));
+  }
+
+  ASSERT_EQ(cluster.count().v, 3U);
+  for (unsigned i = 0; i < payload_count; ++i) {
+    ASSERT_EQ(cluster.getBlobSize(zim::blob_index_t(i)).v, payloads[i].size());
+  }
+}
+
+TEST(ClusterTest, read_write_cluster)
+{
+  zim::writer::Cluster cluster(zim::zimcompNone);
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnop vwxyz");
+
+  cluster.addData(blob0.data(), zim::zsize_t(blob0.size()));
+  cluster.addData(blob1.data(), zim::zsize_t(blob1.size()));
+  cluster.addData(blob2.data(), zim::zsize_t(blob2.size()));
+
+  auto buffer = write_to_buffer(cluster);
+  zim::CompressionType comp;
+  bool extended;
+  auto reader = std::shared_ptr<const zim::Reader>(zim::BufferReader(buffer).sub_clusterReader(zim::offset_t(0), &comp, &extended));
+  zim::Cluster cluster2(reader, zim::zimcompNone, false);
+  ASSERT_EQ(cluster2.count().v, 3U);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+}
+
+// Three zero-length blobs must survive a write/read round trip as three
+// blobs of size zero.
+TEST(ClusterTest, read_write_empty)
+{
+  const unsigned blob_count = 3;
+
+  zim::writer::Cluster written(zim::zimcompNone);
+  for (unsigned i = 0; i < blob_count; ++i) {
+    written.addData(0, zim::zsize_t(0));
+  }
+
+  auto serialized = write_to_buffer(written);
+  // Filled in by sub_clusterReader(); not inspected by this test.
+  zim::CompressionType comp;
+  bool extended;
+  auto reader = std::shared_ptr<const zim::Reader>(zim::BufferReader(serialized).sub_clusterReader(zim::offset_t(0), &comp, &extended));
+
+  zim::Cluster parsed(reader, zim::zimcompNone, false);
+  ASSERT_EQ(parsed.count().v, 3U);
+  for (unsigned i = 0; i < blob_count; ++i) {
+    ASSERT_EQ(parsed.getBlobSize(zim::blob_index_t(i)).v, 0U);
+  }
+}
+
+#if defined(ENABLE_ZLIB)
+TEST(ClusterTest, read_write_clusterZ)
+{
+  zim::writer::Cluster cluster(zim::zimcompZip);
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+
+  cluster.addData(blob0.data(), zim::zsize_t(blob0.size()));
+  cluster.addData(blob1.data(), zim::zsize_t(blob1.size()));
+  cluster.addData(blob2.data(), zim::zsize_t(blob2.size()));
+
+  auto buffer = write_to_buffer(cluster);
+  zim::CompressionType comp;
+  bool extended;
+  auto reader = std::shared_ptr<const zim::Reader>(zim::BufferReader(buffer).sub_clusterReader(zim::offset_t(0), &comp, &extended));
+  ASSERT_EQ(comp, zim::zimcompZip);
+  ASSERT_EQ(extended, false);
+  zim::Cluster cluster2(reader, comp, extended);
+  ASSERT_EQ(cluster2.count().v, 3U);
+  ASSERT_EQ(cluster2.getCompression(), zim::zimcompZip);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  auto b = cluster2.getBlob(zim::blob_index_t(0));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob0.data()));
+  b = cluster2.getBlob(zim::blob_index_t(1));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob1.data()));
+  b = cluster2.getBlob(zim::blob_index_t(2));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob2.data()));
+}
+
+#endif
+
+TEST(ClusterTest, read_write_clusterLzma)
+{
+  zim::writer::Cluster cluster(zim::zimcompLzma);
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+
+  cluster.addData(blob0.data(), zim::zsize_t(blob0.size()));
+  cluster.addData(blob1.data(), zim::zsize_t(blob1.size()));
+  cluster.addData(blob2.data(), zim::zsize_t(blob2.size()));
+
+  auto buffer = write_to_buffer(cluster);
+  zim::CompressionType comp;
+  bool extended;
+  auto reader = std::shared_ptr<const zim::Reader>(zim::BufferReader(buffer).sub_clusterReader(zim::offset_t(0), &comp, &extended));
+  ASSERT_EQ(comp, zim::zimcompLzma);
+  ASSERT_EQ(extended, false);
+  zim::Cluster cluster2(reader, comp, extended);
+  ASSERT_EQ(cluster2.count().v, 3U);
+  ASSERT_EQ(cluster2.getCompression(), zim::zimcompLzma);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  auto b = cluster2.getBlob(zim::blob_index_t(0));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob0.data()));
+  b = cluster2.getBlob(zim::blob_index_t(1));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob1.data()));
+  b = cluster2.getBlob(zim::blob_index_t(2));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob2.data()));
+}
+
+#if defined(ENABLE_ZSTD)
+TEST(ClusterTest, read_write_clusterZstd)
+{
+  zim::writer::Cluster cluster(zim::zimcompZstd);
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+
+  cluster.addData(blob0.data(), zim::zsize_t(blob0.size()));
+  cluster.addData(blob1.data(), zim::zsize_t(blob1.size()));
+  cluster.addData(blob2.data(), zim::zsize_t(blob2.size()));
+
+  auto buffer = write_to_buffer(cluster);
+  zim::CompressionType comp;
+  bool extended;
+  auto reader = std::shared_ptr<const zim::Reader>(zim::BufferReader(buffer).sub_clusterReader(zim::offset_t(0), &comp, &extended));
+  ASSERT_EQ(comp, zim::zimcompZstd);
+  ASSERT_EQ(extended, false);
+  zim::Cluster cluster2(reader, comp, extended);
+  ASSERT_EQ(cluster2.count().v, 3U);
+  ASSERT_EQ(cluster2.getCompression(), zim::zimcompZstd);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  auto b = cluster2.getBlob(zim::blob_index_t(0));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob0.data()));
+  b = cluster2.getBlob(zim::blob_index_t(1));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob1.data()));
+  b = cluster2.getBlob(zim::blob_index_t(2));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob2.data()));
+}
+
+#endif
+
+#if !defined(__APPLE__)
+TEST(ClusterTest, read_write_extended_cluster)
+{
+  //zim::writer doesn't suport 32 bits architectures.
+  if (SIZE_MAX == UINT32_MAX) {
+    return;
+  }
+
+  char* SKIP_BIG_MEMORY_TEST = std::getenv("SKIP_BIG_MEMORY_TEST");
+  if (SKIP_BIG_MEMORY_TEST != nullptr && std::string(SKIP_BIG_MEMORY_TEST) == "1") {
+    return;
+  }
+
+  // MEM = 0
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+  zim::size_type bigger_than_4g = 1024LL*1024LL*1024LL*4LL+1024LL;
+
+  std::shared_ptr<zim::Buffer> buffer;
+  {
+    char* blob3 = nullptr;
+    try {
+      blob3 = new char[bigger_than_4g];
+      // MEM = 4GiB
+    } catch (std::bad_alloc& e) {
+      // Not enough memory, we cannot test cluster bigger than 4Go :(
+      return;
+    }
+
+    {
+      zim::writer::Cluster cluster(zim::zimcompNone);
+      cluster.addData(blob0.data(), zim::zsize_t(blob0.size()));
+      cluster.addData(blob1.data(), zim::zsize_t(blob1.size()));
+      cluster.addData(blob2.data(), zim::zsize_t(blob2.size()));
+      try {
+        cluster.addData(blob3, zim::zsize_t(bigger_than_4g));
+        // MEM = 8GiB
+      } catch (std::bad_alloc& e) {
+        // Not enough memory, we cannot test cluster bigger than 4Go :(
+        delete[] blob3;
+        return;
+      }
+      ASSERT_EQ(cluster.is_extended(), true);
+
+      delete[] blob3;
+      // MEM = 4GiB
+
+      buffer = write_to_buffer(cluster);
+    }
+  }
+  auto reader = std::shared_ptr<zim::Reader>(new zim::BufferReader(buffer));
+  zim::CompressionType comp;
+  bool extended;
+  std::shared_ptr<const zim::Reader> clusterReader
+      = reader->sub_clusterReader(zim::offset_t(0), &comp, &extended);
+  ASSERT_EQ(extended, true);
+  zim::Cluster cluster2(clusterReader, comp, extended);
+  ASSERT_EQ(cluster2.count().v, 4U);
+  ASSERT_EQ(cluster2.getCompression(), zim::zimcompNone);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(3)).v, bigger_than_4g);
+  auto b = cluster2.getBlob(zim::blob_index_t(0));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob0.data()));
+  b = cluster2.getBlob(zim::blob_index_t(1));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob1.data()));
+  b = cluster2.getBlob(zim::blob_index_t(2));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob2.data()));
+}
+#endif
+
+TEST(ClusterTest, read_extended_cluster)
+{
+  std::FILE* tmpfile = std::tmpfile();
+  int fd = fileno(tmpfile);
+  ssize_t bytes_written;
+
+  std::string blob0("123456789012345678901234567890");
+  std::string blob1("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  std::string blob2("abcdefghijklmnopqrstuvwxyz");
+
+  zim::size_type bigger_than_4g = 1024LL*1024LL*1024LL*4LL+1024LL;
+
+  zim::offset_type offset = 5*sizeof(uint64_t);
+
+  char a = 0x11;
+  bytes_written = write(fd, &a, 1);
+
+  char out_buf[sizeof(uint64_t)];
+
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+
+  offset += blob0.size();
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+
+  offset += blob1.size();
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+
+  offset += blob2.size();
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+
+  offset += bigger_than_4g;
+  zim::toLittleEndian(offset, out_buf);
+  bytes_written = write(fd, out_buf, sizeof(uint64_t));
+
+  bytes_written = write(fd, blob0.c_str(), blob0.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob0.size());
+
+  bytes_written = write(fd, blob1.c_str(), blob1.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob1.size());
+
+  bytes_written = write(fd, blob2.c_str(), blob2.size());
+  ASSERT_EQ(bytes_written, (ssize_t)blob2.size());
+
+#ifdef _WIN32
+# define LSEEK _lseeki64
+#else
+# define LSEEK lseek
+#endif
+  LSEEK(fd , bigger_than_4g-1, SEEK_CUR);
+#undef LSEEK
+//  std::fseek(tmpfile, bigger_than_4g-1, SEEK_CUR);
+  a = '\0';
+  bytes_written = write(fd, &a, 1);
+  fflush(tmpfile);
+
+  auto filePart = new zim::FilePart<>(fileno(tmpfile));
+  auto fileCompound = std::shared_ptr<zim::FileCompound>(new zim::FileCompound(filePart));
+  auto reader = std::shared_ptr<zim::Reader>(new zim::FileReader(fileCompound));
+  zim::CompressionType comp;
+  bool extended;
+  std::shared_ptr<const zim::Reader> clusterReader
+      = reader->sub_clusterReader(zim::offset_t(0), &comp, &extended);
+  ASSERT_EQ(extended, true);
+  zim::Cluster cluster2(clusterReader, comp, extended);
+  ASSERT_EQ(cluster2.count().v, 4U);
+  ASSERT_EQ(cluster2.getCompression(), zim::zimcompNone);
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(0)).v, blob0.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(1)).v, blob1.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(2)).v, blob2.size());
+  ASSERT_EQ(cluster2.getBlobSize(zim::blob_index_t(3)).v, bigger_than_4g);
+
+
+  auto b = cluster2.getBlob(zim::blob_index_t(0));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob0.data()));
+  b = cluster2.getBlob(zim::blob_index_t(1));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob1.data()));
+  b = cluster2.getBlob(zim::blob_index_t(2));
+  ASSERT_TRUE(std::equal(b.data(), b.end(), blob2.data()));
+
+  b = cluster2.getBlob(zim::blob_index_t(3));
+  if (SIZE_MAX == UINT32_MAX) {
+    ASSERT_EQ(b.data(), nullptr);
+    ASSERT_EQ(b.size(), 0U);
+  } else {
+    ASSERT_EQ(b.size(), bigger_than_4g);
+  }
+
+  fclose(tmpfile);
+}
+
+
+}  // namespace
diff --git a/test/data/wikibooks_be_all_nopic_2017-02.zim b/test/data/wikibooks_be_all_nopic_2017-02.zim

new file mode 100644 (file)

index 0000000..6706e9b

Binary files /dev/null and b/test/data/wikibooks_be_all_nopic_2017-02.zim differ
diff --git a/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimaa b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimaa

new file mode 100644 (file)

index 0000000..e41c973

Binary files /dev/null and b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimaa differ
diff --git a/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimab b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimab

new file mode 100644 (file)

index 0000000..555e8c9

Binary files /dev/null and b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimab differ
diff --git a/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimac b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimac

new file mode 100644 (file)

index 0000000..f2e3756

Binary files /dev/null and b/test/data/wikibooks_be_all_nopic_2017-02_splitted.zimac differ
diff --git a/test/data/wikipedia_en_climate_change_nopic_2020-01.zim b/test/data/wikipedia_en_climate_change_nopic_2020-01.zim

new file mode 100644 (file)

index 0000000..0711b5b

Binary files /dev/null and b/test/data/wikipedia_en_climate_change_nopic_2020-01.zim differ
diff --git a/test/dirent.cpp b/test/dirent.cpp

new file mode 100644 (file)

index 0000000..7813c8e
--- /dev/null
+++ b/test/dirent.cpp
@@ -0,0 +1,267 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <cstring>
+#include <iostream>
+#include <sstream>
+#include <memory>
+#include <stdexcept>
+
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#endif
+
+#include "gtest/gtest.h"
+
+#include "../src/buffer.h"
+#include "../src/_dirent.h"
+#include "../src/writer/_dirent.h"
+
+namespace
+{
+
+// Serializes `dirent` into a temporary file and returns the bytes that were
+// written, wrapped in a zim::MemoryBuffer, so that a test can parse them back
+// with zim::Dirent.
+//
+// Throws std::runtime_error when the temporary file cannot be created,
+// measured or read back completely.
+std::unique_ptr<zim::Buffer> write_to_buffer(zim::writer::Dirent& dirent)
+{
+#ifdef _WIN32
+  wchar_t cbase[MAX_PATH];
+  wchar_t ctmp[MAX_PATH];
+  GetTempPathW(MAX_PATH-14, cbase);
+  GetTempFileNameW(cbase, L"test_dirent", 0, ctmp);
+  // _O_CREAT makes the pmode argument mandatory; _O_TEMPORARY removes the file
+  // once its last descriptor is closed.
+  auto tmp_fd = _wopen(ctmp,
+                       _O_CREAT | _O_TEMPORARY | _O_SHORT_LIVED | _O_RDWR | _O_TRUNC,
+                       _S_IREAD | _S_IWRITE);
+#else
+  char tmpl[] = "/tmp/test_dirent_XXXXXX";
+  auto tmp_fd = mkstemp(tmpl);
+#endif
+  if (tmp_fd == -1)
+    throw std::runtime_error("Cannot create temporary file");
+
+#ifndef _WIN32
+  // The open descriptor keeps the data reachable, so the name can be removed
+  // right away; nothing is left behind in /tmp even if a later step throws.
+  unlink(tmpl);
+#endif
+
+  // Closes the descriptor on every exit path, including exceptions.
+  struct FdCloser {
+    int fd;
+    ~FdCloser() { close(fd); }
+  } closer{tmp_fd};
+
+  dirent.write(tmp_fd);
+  auto size = lseek(tmp_fd, 0, SEEK_END);
+  if (size < 0 || lseek(tmp_fd, 0, SEEK_SET) < 0)
+    throw std::runtime_error("Cannot seek");
+
+  // Held by a smart pointer until it is handed over, so a failed read does not
+  // leak the allocation.
+  std::unique_ptr<char[]> content(new char[size]);
+  // A short read would leave part of the buffer uninitialized: treat it as an
+  // error just like a failed read.
+  if (read(tmp_fd, content.get(), size) != size)
+    throw std::runtime_error("Cannot read");
+
+  // The raw allocation is passed to MemoryBuffer<true> exactly as before.
+  return std::unique_ptr<zim::Buffer>(
+      new zim::MemoryBuffer<true>(content.release(), zim::zsize_t(size)));
+}
+
+// Writes `dirent` to a temporary file and returns the number of bytes that
+// ended up in that file, for comparison against Dirent::getDirentSize().
+//
+// Throws std::runtime_error when the temporary file cannot be created or
+// measured.
+size_t writenDirentSize(const zim::writer::Dirent& dirent)
+{
+#ifdef _WIN32
+  wchar_t cbase[MAX_PATH];
+  wchar_t ctmp[MAX_PATH];
+  GetTempPathW(MAX_PATH-14, cbase);
+  GetTempFileNameW(cbase, L"test_dirent", 0, ctmp);
+  // _O_CREAT makes the pmode argument mandatory.
+  auto tmp_fd = _wopen(ctmp,
+                       _O_CREAT | _O_TEMPORARY | _O_SHORT_LIVED | _O_RDWR | _O_TRUNC,
+                       _S_IREAD | _S_IWRITE);
+#else
+  char tmpl[] = "/tmp/test_dirent_XXXXXX";
+  auto tmp_fd = mkstemp(tmpl);
+#endif
+  if (tmp_fd == -1)
+    throw std::runtime_error("Cannot create temporary file");
+
+  dirent.write(tmp_fd);
+  auto size = lseek(tmp_fd, 0, SEEK_END);
+  close(tmp_fd);
+#ifndef _WIN32
+  unlink(tmpl);
+#endif
+  // A failed lseek reports -1, which must not be converted to a huge size_t.
+  if (size < 0)
+    throw std::runtime_error("Cannot seek");
+  return static_cast<size_t>(size);
+}
+
+// Exercises the plain setters and getters of the reader-side zim::Dirent.
+TEST(DirentTest, set_get_data_dirent)
+{
+  zim::Dirent entry;
+  entry.setUrl('A', "Bar");
+  entry.setArticle(17, zim::cluster_index_t(45), zim::blob_index_t(1234));
+  entry.setVersion(54346);
+
+  // No title has been set yet: the test expects getTitle() to report the url.
+  ASSERT_FALSE(entry.isRedirect());
+  ASSERT_EQ(entry.getNamespace(), 'A');
+  ASSERT_EQ(entry.getUrl(), "Bar");
+  ASSERT_EQ(entry.getTitle(), "Bar");
+  ASSERT_EQ(entry.getParameter(), "");
+  ASSERT_EQ(entry.getClusterNumber().v, 45U);
+  ASSERT_EQ(entry.getBlobNumber().v, 1234U);
+  ASSERT_EQ(entry.getVersion(), 54346U);
+
+  // An explicit title replaces the fallback and leaves the url untouched.
+  entry.setTitle("Foo");
+  ASSERT_EQ(entry.getNamespace(), 'A');
+  ASSERT_EQ(entry.getUrl(), "Bar");
+  ASSERT_EQ(entry.getTitle(), "Foo");
+  ASSERT_EQ(entry.getParameter(), "");
+}
+
+// Round-trips an article dirent with distinct url and title through the
+// writer serialization and the reader parser.
+TEST(DirentTest, read_write_article_dirent)
+{
+  zim::writer::Dirent dirent;
+  dirent.setUrl(zim::writer::Url('A', "Bar"));
+  dirent.setTitle("Foo");
+  dirent.setArticle(17, zim::cluster_index_t(45), zim::blob_index_t(1234));
+
+  ASSERT_TRUE(!dirent.isRedirect());
+  ASSERT_EQ(dirent.getNamespace(), 'A');
+  ASSERT_EQ(dirent.getUrl(), "Bar");
+  ASSERT_EQ(dirent.getTitle(), "Foo");
+  ASSERT_EQ(dirent.getClusterNumber().v, 45U);
+  ASSERT_EQ(dirent.getBlobNumber().v, 1234U);
+  ASSERT_EQ(dirent.getVersion(), 0U);
+
+  auto buffer = write_to_buffer(dirent);
+  zim::Dirent dirent2(std::move(buffer));
+
+  ASSERT_TRUE(!dirent2.isRedirect());
+  ASSERT_EQ(dirent2.getNamespace(), 'A');
+  // The url must survive the round trip as well; this is the only test where
+  // url and title differ, so a url/title mix-up would show up here.
+  ASSERT_EQ(dirent2.getUrl(), "Bar");
+  ASSERT_EQ(dirent2.getTitle(), "Foo");
+  ASSERT_EQ(dirent2.getParameter(), "");
+  ASSERT_EQ(dirent2.getClusterNumber().v, 45U);
+  ASSERT_EQ(dirent2.getBlobNumber().v, 1234U);
+  ASSERT_EQ(dirent2.getVersion(), 0U);
+}
+
+// Round-trips an article dirent whose url contains a multi-byte UTF-8
+// sequence.
+TEST(DirentTest, read_write_article_dirent_unicode)
+{
+  // Same array type as the string literal it replaces.
+  const char utf8Name[] = "L\xc3\xbcliang";
+
+  zim::writer::Dirent written;
+  written.setUrl(zim::writer::Url('A', utf8Name));
+  written.setArticle(17, zim::cluster_index_t(45), zim::blob_index_t(1234));
+
+  ASSERT_FALSE(written.isRedirect());
+  ASSERT_EQ(written.getNamespace(), 'A');
+  ASSERT_EQ(written.getUrl(), utf8Name);
+  ASSERT_EQ(written.getTitle(), utf8Name);
+  ASSERT_EQ(written.getClusterNumber().v, 45U);
+  ASSERT_EQ(written.getBlobNumber().v, 1234U);
+
+  auto serialized = write_to_buffer(written);
+  zim::Dirent parsed(std::move(serialized));
+
+  ASSERT_FALSE(parsed.isRedirect());
+  ASSERT_EQ(parsed.getNamespace(), 'A');
+  ASSERT_EQ(parsed.getUrl(), utf8Name);
+  ASSERT_EQ(parsed.getTitle(), utf8Name);
+  ASSERT_EQ(parsed.getParameter(), "");
+  ASSERT_EQ(parsed.getClusterNumber().v, 45U);
+  ASSERT_EQ(parsed.getBlobNumber().v, 1234U);
+}
+
+// Round-trips a redirect dirent and checks that the target index is kept.
+TEST(DirentTest, read_write_redirect_dirent)
+{
+  // The redirect points at this dirent; only its index matters here.
+  zim::writer::Dirent target;
+  target.setIdx(zim::article_index_t(321));
+
+  zim::writer::Dirent redirect;
+  redirect.setUrl(zim::writer::Url('A', "Bar"));
+  redirect.setRedirect(&target);
+
+  ASSERT_TRUE(redirect.isRedirect());
+  ASSERT_EQ(redirect.getNamespace(), 'A');
+  ASSERT_EQ(redirect.getUrl(), "Bar");
+  ASSERT_EQ(redirect.getRedirectIndex().v, 321U);
+
+  auto serialized = write_to_buffer(redirect);
+  zim::Dirent parsed(std::move(serialized));
+
+  ASSERT_TRUE(parsed.isRedirect());
+  ASSERT_EQ(parsed.getNamespace(), 'A');
+  ASSERT_EQ(parsed.getUrl(), "Bar");
+  ASSERT_EQ(parsed.getTitle(), "Bar");
+  ASSERT_EQ(parsed.getRedirectIndex().v, 321U);
+}
+
+// Round-trips a dirent flagged as link target.
+TEST(DirentTest, read_write_linktarget_dirent)
+{
+  zim::writer::Dirent written;
+  written.setUrl(zim::writer::Url('A', "Bar"));
+  written.setLinktarget();
+
+  ASSERT_FALSE(written.isRedirect());
+  ASSERT_TRUE(written.isLinktarget());
+  ASSERT_FALSE(written.isDeleted());
+  ASSERT_EQ(written.getNamespace(), 'A');
+  ASSERT_EQ(written.getUrl(), "Bar");
+
+  auto serialized = write_to_buffer(written);
+  zim::Dirent parsed(std::move(serialized));
+
+  ASSERT_FALSE(parsed.isRedirect());
+  ASSERT_TRUE(parsed.isLinktarget());
+  ASSERT_FALSE(parsed.isDeleted());
+  ASSERT_EQ(parsed.getNamespace(), 'A');
+  ASSERT_EQ(parsed.getUrl(), "Bar");
+  ASSERT_EQ(parsed.getTitle(), "Bar");
+}
+
+// Round-trips a dirent flagged as deleted.
+TEST(DirentTest, read_write_deleted_dirent)
+{
+  zim::writer::Dirent written;
+  written.setUrl(zim::writer::Url('A', "Bar"));
+  written.setDeleted();
+
+  ASSERT_FALSE(written.isRedirect());
+  ASSERT_FALSE(written.isLinktarget());
+  ASSERT_TRUE(written.isDeleted());
+  ASSERT_EQ(written.getNamespace(), 'A');
+  ASSERT_EQ(written.getUrl(), "Bar");
+
+  auto serialized = write_to_buffer(written);
+  zim::Dirent parsed(std::move(serialized));
+
+  ASSERT_FALSE(parsed.isRedirect());
+  ASSERT_FALSE(parsed.isLinktarget());
+  ASSERT_TRUE(parsed.isDeleted());
+  ASSERT_EQ(parsed.getNamespace(), 'A');
+  ASSERT_EQ(parsed.getUrl(), "Bar");
+  ASSERT_EQ(parsed.getTitle(), "Bar");
+}
+
+// Checks that getDirentSize() matches the number of bytes write() actually
+// produces for an article dirent, with and without an explicit title.
+TEST(DirentTest, dirent_size)
+{
+  zim::writer::Dirent dirent;
+  dirent.setArticle(17, zim::cluster_index_t(45), zim::blob_index_t(1234));
+  dirent.setUrl(zim::writer::Url('A', "Bar"));
+
+  // case url set, title empty, extralen empty
+  ASSERT_EQ(dirent.getDirentSize(), writenDirentSize(dirent));
+
+  // case url set, title set, extralen empty
+  dirent.setTitle("Foo");
+  ASSERT_EQ(dirent.getDirentSize(), writenDirentSize(dirent));
+
+  // case url set, title reset to empty, extralen empty
+  dirent.setTitle(std::string());
+  ASSERT_EQ(dirent.getDirentSize(), writenDirentSize(dirent));
+}
+
+// Checks that getDirentSize() matches the number of bytes write() actually
+// produces for a redirect dirent.
+TEST(DirentTest, redirect_dirent_size)
+{
+  // Target of the redirect; only its index is set.
+  zim::writer::Dirent targetDirent;
+  targetDirent.setIdx(zim::article_index_t(321));
+  zim::writer::Dirent dirent;
+  dirent.setUrl(zim::writer::Url('A', "Bar"));
+  dirent.setRedirect(&targetDirent);
+
+  ASSERT_EQ(dirent.getDirentSize(), writenDirentSize(dirent));
+}
+
+}  // namespace
diff --git a/test/find.cpp b/test/find.cpp

new file mode 100644 (file)

index 0000000..494403f
--- /dev/null
+++ b/test/find.cpp
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2009 Miguel Rocha
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/file.h>
+#include <zim/error.h>
+#include <zim/fileiterator.h>
+
+#include "gtest/gtest.h"
+
+namespace
+{
+// Not found cases
+
+// ByTitle
+TEST(FindTests, NotFoundByTitle)
+{
+    // A title look-up that matches nothing still yields an iterator; the test
+    // pins the index it points at for each namespace.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto inNamespaceU = zimFile.findByTitle('U', "unkownTitle");
+    auto inNamespaceA = zimFile.findByTitle('A', "unkownTitle");
+    ASSERT_EQ(inNamespaceU->getIndex(), 0);
+    ASSERT_EQ(inNamespaceA->getIndex(), 7);
+}
+
+// By URL
+TEST(FindTests, NotFoundByURL)
+{
+    // A url look-up that matches nothing still yields an iterator; the test
+    // pins the index it points at for each namespace.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto inNamespaceU = zimFile.find('U', "unkwonUrl");
+    auto inNamespaceA = zimFile.find('A', "unkwonUrl");
+    ASSERT_EQ(inNamespaceU->getIndex(), 0);
+    ASSERT_EQ(inNamespaceA->getIndex(), 7);
+}
+
+// By URL (no ns)
+TEST(FindTests, NotFoundByURLDefaultNS)
+{
+    // Same as NotFoundByURL, but through the single-string overload, with and
+    // without a namespace prefix in the path.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto withoutNamespace = zimFile.find("unkwonUrl");
+    auto inNamespaceU = zimFile.find("U/unkwonUrl");
+    auto inNamespaceA = zimFile.find("A/unkwonUrl");
+    ASSERT_EQ(withoutNamespace->getIndex(), 0);
+    ASSERT_EQ(inNamespaceU->getIndex(), 0);
+    ASSERT_EQ(inNamespaceA->getIndex(), 7);
+}
+
+// Found cases
+
+// ByTitle
+TEST(FindTests, ByTitle)
+{
+    // Title look-ups for entries the test expects to exist in the file.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto bodyScript = zimFile.findByTitle('-', "j/body.js");
+    auto indexPage = zimFile.findByTitle('A', "index.html");
+    // NOTE(review): the first check calls getIndex() on the returned object
+    // itself (`.`) while every other check in this file goes through `->`;
+    // confirm whether that difference is intentional.
+    ASSERT_EQ(bodyScript.getIndex(), 1);
+    ASSERT_EQ(indexPage->getIndex(), 7);
+}
+
+// By URL
+TEST(FindTests, ByURL)
+{
+    // Url look-ups (namespace given separately) for entries the test expects
+    // to exist in the file.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto bodyScript = zimFile.find('-', "j/body.js");
+    auto image = zimFile.find('I', "m/115a35549794e50dcd03e60ef1a1ae24.png");
+    ASSERT_EQ(bodyScript->getIndex(), 1);
+    ASSERT_EQ(image->getIndex(), 76);
+}
+
+// By URL (no ns)
+TEST(FindTests, ByURLDefaultNS)
+{
+    // Url look-ups through the single-string overload, where the namespace is
+    // the first path component.
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto mainPage = zimFile.find("A/Main_Page.html");
+    auto loaderGif = zimFile.find("I/s/ajax-loader.gif");
+    auto headScript = zimFile.find("-/j/head.js");
+    ASSERT_EQ(mainPage->getIndex(), 5);
+    ASSERT_EQ(loaderGif->getIndex(), 80);
+    ASSERT_EQ(headScript->getIndex(), 2);
+}
+
+} // namespace
diff --git a/test/header.cpp b/test/header.cpp

new file mode 100644 (file)

index 0000000..3c1c94c
--- /dev/null
+++ b/test/header.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <stdexcept>
+#ifdef _WIN32
+#include <windows.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <io.h>
+#include <fileapi.h>
+#endif
+
+#include <iostream>
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+#include <zim/fileheader.h>
+
+#include "../src/buffer.h"
+
+namespace
+{
+
+// Serializes `header` into a temporary file and returns the written bytes
+// wrapped in a zim::MemoryBuffer, ready to be passed to Fileheader::read().
+//
+// Throws std::runtime_error when the temporary file cannot be created,
+// measured or read back completely.
+std::shared_ptr<zim::Buffer> write_to_buffer(zim::Fileheader &header)
+{
+#ifdef _WIN32
+  wchar_t cbase[MAX_PATH];
+  wchar_t ctmp[MAX_PATH];
+  GetTempPathW(MAX_PATH-14, cbase);
+  // Produces a unique file name; the file is (re)opened just below and, thanks
+  // to _O_TEMPORARY, removed when its last descriptor is closed.
+  GetTempFileNameW(cbase, L"test_header", 0, ctmp);
+  // _O_CREAT makes the pmode argument mandatory.
+  auto tmp_fd = _wopen(ctmp,
+                       _O_CREAT | _O_TEMPORARY | _O_SHORT_LIVED | _O_RDWR | _O_TRUNC,
+                       _S_IREAD | _S_IWRITE);
+#else
+  char tmpl[] = "/tmp/test_header_XXXXXX";
+  auto tmp_fd = mkstemp(tmpl);
+#endif
+  if (tmp_fd == -1)
+    throw std::runtime_error("Cannot create temporary file");
+
+#ifndef _WIN32
+  // The open descriptor keeps the data reachable, so the name can be removed
+  // right away; nothing is left behind in /tmp even if a later step throws.
+  unlink(tmpl);
+#endif
+
+  // Closes the descriptor on every exit path, including exceptions.
+  struct FdCloser {
+    int fd;
+    ~FdCloser() { close(fd); }
+  } closer{tmp_fd};
+
+  header.write(tmp_fd);
+  auto size = lseek(tmp_fd, 0, SEEK_END);
+  if (size < 0 || lseek(tmp_fd, 0, SEEK_SET) < 0)
+    throw std::runtime_error("Cannot seek");
+
+  // Held by a smart pointer until it is handed over, so a failed read does not
+  // leak the allocation.
+  std::unique_ptr<char[]> content(new char[size]);
+  // A short read would leave part of the buffer uninitialized: treat it as an
+  // error just like a failed read.
+  if (read(tmp_fd, content.get(), size) != size)
+    throw std::runtime_error("Cannot read");
+
+  // The raw allocation is passed to MemoryBuffer<true> exactly as before.
+  return std::shared_ptr<zim::Buffer>(
+      new zim::MemoryBuffer<true>(content.release(), zim::zsize_t(size)));
+}
+
+// Sets every header field, checks the getters, then round-trips the header
+// through write()/read() and checks every field again.
+TEST(HeaderTest, read_write_header)
+{
+  zim::Fileheader header;
+  header.setUuid("1234567890abcdef");
+  header.setArticleCount(4711);
+  header.setUrlPtrPos(12345);
+  header.setTitleIdxPos(23456);
+  header.setClusterCount(14);
+  header.setClusterPtrPos(45678);
+  header.setMainPage(11);
+  header.setLayoutPage(13);
+  header.setMimeListPos(72);
+
+  ASSERT_EQ(header.getUuid(), "1234567890abcdef");
+  ASSERT_EQ(header.getArticleCount(), 4711U);
+  ASSERT_EQ(header.getUrlPtrPos(), 12345U);
+  ASSERT_EQ(header.getTitleIdxPos(), 23456U);
+  ASSERT_EQ(header.getClusterCount(), 14U);
+  ASSERT_EQ(header.getClusterPtrPos(), 45678U);
+  ASSERT_EQ(header.getMainPage(), 11U);
+  ASSERT_EQ(header.getLayoutPage(), 13U);
+  ASSERT_EQ(header.getMimeListPos(), 72U);
+
+  auto buffer = write_to_buffer(header);
+  zim::Fileheader header2;
+  header2.read(buffer);
+
+  ASSERT_EQ(header2.getUuid(), "1234567890abcdef");
+  ASSERT_EQ(header2.getArticleCount(), 4711U);
+  ASSERT_EQ(header2.getUrlPtrPos(), 12345U);
+  ASSERT_EQ(header2.getTitleIdxPos(), 23456U);
+  ASSERT_EQ(header2.getClusterCount(), 14U);
+  ASSERT_EQ(header2.getClusterPtrPos(), 45678U);
+  ASSERT_EQ(header2.getMainPage(), 11U);
+  ASSERT_EQ(header2.getLayoutPage(), 13U);
+  // Previously the only field that was set but never checked after reading.
+  ASSERT_EQ(header2.getMimeListPos(), 72U);
+}
+
+}  // namespace
diff --git a/test/iterator.cpp b/test/iterator.cpp

new file mode 100644 (file)

index 0000000..04370b8
--- /dev/null
+++ b/test/iterator.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (C) 2009 Miguel Rocha
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/zim.h>
+#include <zim/file.h>
+#include <zim/error.h>
+#include <zim/fileiterator.h>
+
+#include "gtest/gtest.h"
+
+namespace
+{
+
+
+// Checks File::getArticleByClusterOrder() against a hard-coded table: entry i
+// of `expected` is the article index the test expects at cluster-order
+// position i of the test file.
+TEST(ClusterIteratorTest, getArticleByClusterOrder)
+{
+    std::vector<zim::article_index_type> expected = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 109, 110, 111, 112, 113, 114, 115, 116,
+117, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108 };
+
+    zim::File file ("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto nbArticles = file.getCountArticles();
+
+    // Fatal on mismatch, so the loop below never indexes past `expected`.
+    ASSERT_EQ(nbArticles, expected.size());
+
+    for (auto i = 0u; i < nbArticles; i++)
+    {
+        EXPECT_EQ(file.getArticleByClusterOrder(i).getIndex(), expected[i]);
+    }
+}
+
+// Asking for the article one past the last valid index must raise
+// zim::ZimFileFormatError with the message "article index out of range".
+TEST(getArticle, indexOutOfRange)
+{
+    zim::File file ("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto nbArticles = file.getCountArticles();
+
+    try {
+        file.getArticle(nbArticles);
+        FAIL() << "Should throw exception\n";
+    }  catch (const zim::ZimFileFormatError &e) {
+        ASSERT_EQ(e.what(), std::string("article index out of range"));
+    }  catch(...) {
+        // Something was thrown, just not the expected type: say so instead of
+        // claiming that nothing was thrown.
+        FAIL() << "Should throw zim::ZimFileFormatError, but another exception type was thrown\n";
+    }
+}
+
+// Full iteration from File::begin() to File::end()
+TEST(IteratorTests, begin)
+{
+    std::vector<zim::article_index_type> expected = {
+0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 109, 110, 111, 112, 113, 114, 115, 116,
+117, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108 };
+
+    zim::File file ("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto it = file.begin();
+    size_t i = 0;
+    while (it != file.end())
+    {
+        // Abort instead of indexing past the table if the file yields more
+        // articles than expected.
+        ASSERT_LT(i, expected.size());
+        EXPECT_EQ(it->getIndex(), expected[i]);
+        it++; i++;
+    }
+    // Every expected article must have been visited.
+    ASSERT_EQ(i, expected.size());
+}
+
+
+// ByTitle: the first entries reached from File::beginByTitle()
+TEST(IteratorTests, beginByTitle)
+{
+    std::vector<zim::article_index_type> expected = { 0, 1, 2, 3, 4, 5, 7, 8, 9, 10};
+    zim::File file ("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto it = file.beginByTitle();
+
+    // Bounded by the table itself, so the two cannot drift apart.
+    for (size_t i = 0; i < expected.size(); i++)
+    {
+        EXPECT_EQ(it->getIndex(), expected[i]);
+        it++;
+    }
+}
+
+
+// ByUrl
+TEST(IteratorTests, beginByUrl)
+{
+    // Only the first ten entries reached from beginByUrl() are compared.
+    std::vector<zim::article_index_type> expected = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    zim::File zimFile("./test/wikibooks_be_all_nopic_2017-02.zim");
+
+    auto cursor = zimFile.beginByUrl();
+    for (int position = 0; position < 10; position++)
+    {
+        EXPECT_EQ(cursor->getIndex(), expected[position]);
+        cursor++;
+    }
+}
+
+} // namespace
diff --git a/test/meson.build b/test/meson.build

new file mode 100644 (file)

index 0000000..424eb31
--- /dev/null
+++ b/test/meson.build
@@ -0,0 +1,31 @@
+
+# Copy the reference ZIM file next to the test binaries in the build tree;
+# the C++ tests open it as ./test/wikibooks_be_all_nopic_2017-02.zim.
+configure_file(input : 'data/wikibooks_be_all_nopic_2017-02.zim',
+               output : 'wikibooks_be_all_nopic_2017-02.zim',
+               copy: true )
+
+# One gtest executable is built from <name>.cpp for each entry.
+tests = [
+    'cluster',
+    'dirent',
+    'header',
+    'uuid',
+    'template',
+    'iterator',
+    'find'
+]
+
+# The C++ tests need gtest and are skipped when cross-compiling.
+if gtest_dep.found() and not meson.is_cross_build()
+    foreach test_name : tests
+        test_exe = executable(test_name, [test_name+'.cpp'],
+                              implicit_include_directories: false,
+                              include_directories : [include_directory, src_directory],
+                              link_with : libzim,
+                              link_args: extra_link_args,
+                              dependencies : deps + [gtest_dep],
+                              build_rpath : '$ORIGIN')
+        test(test_name, test_exe, timeout : 60)
+    endforeach
+endif
+
+# The Python tests are only set up for non-static, non-cross builds.
+if get_option('default_library') != 'static' and not meson.is_cross_build()
+    subdir('pytest')
+endif
diff --git a/test/pytest/basic_open_test.py b/test/pytest/basic_open_test.py

new file mode 100644 (file)

index 0000000..3303f04
--- /dev/null
+++ b/test/pytest/basic_open_test.py
@@ -0,0 +1,121 @@
+import libzim_ext as libzim
+import pytest
+
+import os
+from pathlib import Path
+from itertools import product
+import hashlib
+
+DATADIR = Path(__file__).resolve().parent.parent/'data'
+
+@pytest.fixture(params=product(
+    (b'ZIM\x04', b''),
+    (0x00, 0x01, 0x11, 0x30, 0xFF),
+    range(0, 100, 10)
+))
+def wrong_zim(request, tmpdir):
+    """Create a file that is not a valid ZIM and return its path.
+
+    Each parameter is (prefix, byte, file_size): the file holds the optional
+    magic prefix followed by `file_size` copies of `byte`.
+    """
+    magic, filler, length = request.param
+    label = 'prefix' if magic else 'noprefix'
+    path = tmpdir/'{}_{}_{:x}.zim'.format(label, length, filler)
+    payload = magic + bytes([filler])*length
+    with open(str(path), 'wb') as out:
+        out.write(payload)
+    return path
+
+
+# ZIM files shipped with the tests. Split archives are represented by their
+# first chunk (.zimaa) only. The list is sorted because glob order depends on
+# the filesystem, and it is materialized so it can be iterated more than once.
+zim_files = sorted(
+    p for p in DATADIR.glob('*.zim*') if p.suffix in ('.zim', '.zimaa')
+)
+
+
+@pytest.fixture(params=zim_files)
+def existing_zim_file(request):
+    """Return the path to open for one of the shipped ZIM files.
+
+    The suffix is normalized to '.zim', so a '.zimaa' first chunk becomes the
+    base name of the split archive.
+    """
+    zim_path = request.param
+    return zim_path.with_suffix('.zim')
+
+def gen_empty_zim_content():
+    """Return the bytes of a ZIM file with no article and no cluster.
+
+    The result is an 80-byte header in which every position field holds 80,
+    followed by the 16-byte MD5 digest of that header.
+    """
+    pos_80 = bytes([80] + [0]*7)  # 64-bit little-endian value 80
+    header = b''.join((
+        b'ZIM\x04',          # Magic
+        b'\x05\x00\x00\x00', # Version
+        bytes([0])*16,       # uuid
+        bytes([0])*4,        # article count
+        bytes([0])*4,        # cluster count
+        pos_80,              # url ptr pos
+        pos_80,              # title ptr pos
+        pos_80,              # cluster ptr pos
+        pos_80,              # mimelist ptr pos
+        bytes([0])*4,        # main page index
+        bytes([0])*4,        # layout page index
+        pos_80,              # checksum pos
+    ))
+    return header + hashlib.md5(header).digest()
+
+
+@pytest.fixture
+def empty_zim_file(tmpdir):
+    """Write the empty ZIM image to 'empty.zim' in tmpdir and return its path."""
+    path = tmpdir/'empty.zim'
+    data = gen_empty_zim_content()
+    with open(str(path), 'wb') as out:
+        out.write(data)
+    return path
+
+
+def _nasty_offset_filter(offset):
+    """Tell whether the header byte at `offset` should be corrupted.
+
+    Returns False for the offsets of the fields excluded from the corruption
+    test, True for every other offset.
+    """
+    skipped_ranges = (
+        (6, 8),    # Minor version
+        (8, 24),   # uuid
+        (64, 72),  # page and layout index
+    )
+    return not any(start <= offset < stop for start, stop in skipped_ranges)
+
+@pytest.fixture(params=filter(_nasty_offset_filter, range(80)))
+def nasty_empty_zim_file(request, tmpdir):
+    """Write the empty ZIM image with one header byte set to 0xFF.
+
+    The corrupted offset is the fixture parameter; the path is returned.
+    """
+    position = request.param
+    corrupted = bytearray(gen_empty_zim_content())
+    corrupted[position] = 0xFF
+    path = tmpdir/'nasty_empty_{}.zim'.format(position)
+    with open(str(path), 'wb') as out:
+        out.write(bytes(corrupted))
+    return path
+
+@pytest.fixture
+def wrong_checksum_empty_zim_file(tmpdir):
+    """Write the empty ZIM image with byte 85 altered and return its path."""
+    content = gen_empty_zim_content()
+    # Flip every bit of the byte instead of storing a fixed 0xFF: the result is
+    # guaranteed to differ from the original byte, whatever its value.
+    content = content[:85] + bytes([content[85] ^ 0xFF]) + content[86:]
+    filename = tmpdir/'wrong_checksum_empty.zim'
+    with open(str(filename), 'wb') as f:
+        f.write(content)
+    return filename
+
+
+def test_open_wrong_zim(wrong_zim):
+    """Opening a file that is not a ZIM must raise RuntimeError."""
+    print("opening {}".format(wrong_zim))
+    encoded_path = str(wrong_zim).encode()
+    with pytest.raises(RuntimeError):
+        libzim.File(encoded_path)
+
+
+def test_open_nasty_empty_zim(nasty_empty_zim_file):
+    """Opening an empty ZIM with a corrupted header byte must raise RuntimeError."""
+    print("opening {}".format(nasty_empty_zim_file))
+    encoded_path = str(nasty_empty_zim_file).encode()
+    with pytest.raises(RuntimeError):
+        libzim.File(encoded_path)
+
+
+def test_open_existing_zim(existing_zim_file):
+    """Each shipped ZIM file must open and pass verify()."""
+    print("opening {}".format(existing_zim_file))
+    encoded_path = str(existing_zim_file).encode()
+    zim_file = libzim.File(encoded_path)
+    assert zim_file.verify()
+
+def test_open_empty_zim(empty_zim_file):
+    """The generated empty ZIM must open and pass verify()."""
+    print("opening {}".format(empty_zim_file))
+    encoded_path = str(empty_zim_file).encode()
+    zim_file = libzim.File(encoded_path)
+    assert zim_file.verify()
+
+def test_verify_wrong_checksum(wrong_checksum_empty_zim_file):
+    """A ZIM whose stored checksum is wrong must open but fail verify()."""
+    print("opening {}".format(wrong_checksum_empty_zim_file))
+    encoded_path = str(wrong_checksum_empty_zim_file).encode()
+    zim_file = libzim.File(encoded_path)
+    assert not zim_file.verify()
diff --git a/test/pytest/meson.build b/test/pytest/meson.build

new file mode 100644 (file)

index 0000000..3081d17
--- /dev/null
+++ b/test/pytest/meson.build
@@ -0,0 +1,21 @@
+
+# Both tools are optional: without them the Python tests are silently skipped.
+cython = find_program('cython3', required : false)
+py3_dep = dependency('python3', required : false)
+
+if cython.found() and py3_dep.found()
+  py3_mod = import('python3')
+  py3 = py3_mod.find_python()
+
+  # Probe at configure time whether pytest can be run with this interpreter.
+  has_pytest = (run_command(py3, ['-m', 'pytest', '--version']).returncode() == 0)
+
+  if has_pytest
+    # Builds the libzim_ext extension module and defines `pydir`.
+    subdir('wrapper')
+
+    # Run pytest from the source directory, with the directory holding the
+    # built extension on PYTHONPATH.
+    test('pytest',
+      py3,
+      args: ['-m', 'pytest'],
+      env: ['PYTHONPATH=' + pydir],
+      workdir: meson.current_source_dir()
+    )
+  endif
+endif
diff --git a/test/pytest/wrapper/libzim_ext.pyx b/test/pytest/wrapper/libzim_ext.pyx

new file mode 100644 (file)

index 0000000..5e0a4bf
--- /dev/null
+++ b/test/pytest/wrapper/libzim_ext.pyx
@@ -0,0 +1,10 @@
+cimport zim_wrapper as zim
+
+# Minimal Python-visible wrapper around the C++ zim::File declared in
+# zim_wrapper.pxd; it exposes only what the pytest suite needs.
+cdef class File:
+    # The wrapped C++ object, stored by value.
+    cdef zim.File c_file
+
+    # Opens the ZIM file named by `filename` (a bytes object).
+    def __cinit__(self, bytes filename):
+        self.c_file = zim.File(filename)
+
+    # Returns the result of the C++ File::verify().
+    def verify(self):
+        return self.c_file.verify()
diff --git a/test/pytest/wrapper/meson.build b/test/pytest/wrapper/meson.build

new file mode 100644 (file)

index 0000000..11fc384
--- /dev/null
+++ b/test/pytest/wrapper/meson.build
@@ -0,0 +1,13 @@
+
+# Translate the Cython source into C++.
+pyx_cpp = custom_target('libzim_pyx',
+  output : 'libzim_pyx.cpp',
+  input : 'libzim_ext.pyx',
+  command : [cython, '--gdb', '--cplus', '@INPUT@', '-o', '@OUTPUT@'],
+)
+
+# Build the generated C++ into the `libzim_ext` extension module, linked
+# against libzim.
+slib = py3_mod.extension_module('libzim_ext', pyx_cpp,
+  dependencies : py3_dep,
+  link_with: libzim)
+
+# Directory containing the built module; the parent meson.build puts it on
+# PYTHONPATH.
+pydir = meson.current_build_dir()
+
diff --git a/test/pytest/wrapper/zim_wrapper.pxd b/test/pytest/wrapper/zim_wrapper.pxd

new file mode 100644 (file)

index 0000000..52e761a
--- /dev/null
+++ b/test/pytest/wrapper/zim_wrapper.pxd
@@ -0,0 +1,8 @@
+from libcpp.string cimport string
+
+# Cython declaration of the subset of the C++ zim::File API used by the test
+# wrapper. `except +` lets Cython turn C++ exceptions from the constructors
+# into Python exceptions.
+cdef extern from "../../../include/zim/file.h" namespace "zim":
+    cdef cppclass File:
+        File() except +
+        File(string filename) except +
+
+        # Declared without `except +`.
+        bint verify()
diff --git a/test/template.cpp b/test/template.cpp

new file mode 100644 (file)

index 0000000..f0a606e
--- /dev/null
+++ b/test/template.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2009 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include "../src/template.h"
+
+#include "gtest/gtest.h"
+
+namespace
+{
+// Test fixture that doubles as the parser's event sink: every callback appends
+// a textual trace to `result`, which the tests then compare against.
+class TemplateTest : public ::testing::Test, private zim::TemplateParser::Event
+{
+ public:
+  std::string result;
+  zim::TemplateParser parser;
+
+  TemplateTest() : parser(this) {}
+
+ private:
+  // Plain data is copied through unchanged.
+  void onData(const std::string& data) { result.append(data); }
+
+  // A token is traced as T(<token>).
+  void onToken(const std::string& token)
+  {
+    result += "T(" + token + ")";
+  }
+
+  // A link is traced as L(<ns>, <title>).
+  void onLink(char ns, const std::string& title)
+  {
+    result += std::string("L(") + ns + ", " + title + ")";
+  }
+};
+
+// Input without any template marker must be reported back unchanged.
+TEST_F(TemplateTest, ZeroTemplate)
+{
+  parser.parse("<html><body><h1>Hi</h1></body></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html><body><h1>Hi</h1></body></html>");
+}
+
+// "<%content%>" must be reported through onToken(), which the fixture traces
+// as T(content).
+TEST_F(TemplateTest, Token)
+{
+  parser.parse("<html><%content%></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html>T(content)</html>");
+}
+
+// "<%/A/Article%>" must be reported through onLink() with namespace 'A' and
+// title "Article", which the fixture traces as L(A, Article).
+TEST_F(TemplateTest, Link)
+{
+  parser.parse("<html><%/A/Article%></html>");
+  parser.flush();
+
+  ASSERT_EQ(result, "<html>L(A, Article)</html>");
+}
+
+}  // namespace
diff --git a/test/uuid.cpp b/test/uuid.cpp

new file mode 100644 (file)

index 0000000..305dbaa
--- /dev/null
+++ b/test/uuid.cpp
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2013 Tommi Maekitalo
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * is provided AS IS, WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, and
+ * NON-INFRINGEMENT.  See the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA
+ *
+ */
+
+#include <zim/uuid.h>
+#include <iostream>
+#include <sstream>
+
+#include "gtest/gtest.h"
+#ifdef _WIN32
+# include <windows.h>
+# include <synchapi.h>
+#else
+# include <unistd.h>
+#endif
+
+namespace
+{
+TEST(UuidTest, construct)
+{
+  zim::Uuid uuid1(
+      "\x01\x23\x45\x67\x89\xab\xcd\xef\x10\x32\x54\x76\x98\xba\xdc\xfe");
+  zim::Uuid uuid2(
+      "\x01\x23\x45\x67\x89\xab\xcd\xe0\x10\x32\x54\x76\x98\xba\xdc\x0e");
+
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != zim::Uuid());
+  ASSERT_TRUE(uuid2 != zim::Uuid());
+
+  ASSERT_EQ(uuid1.data[0], '\x01');
+  ASSERT_EQ(uuid1.data[1], '\x23');
+  ASSERT_EQ(uuid1.data[2], '\x45');
+  ASSERT_EQ(uuid1.data[3], '\x67');
+  ASSERT_EQ(uuid1.data[4], '\x89');
+  ASSERT_EQ(uuid1.data[5], '\xab');
+  ASSERT_EQ(uuid1.data[6], '\xcd');
+  ASSERT_EQ(uuid1.data[7], '\xef');
+  ASSERT_EQ(uuid1.data[8], '\x10');
+  ASSERT_EQ(uuid1.data[9], '\x32');
+  ASSERT_EQ(uuid1.data[10], '\x54');
+  ASSERT_EQ(uuid1.data[11], '\x76');
+  ASSERT_EQ(uuid1.data[12], '\x98');
+  ASSERT_EQ(uuid1.data[13], '\xba');
+  ASSERT_EQ(uuid1.data[14], '\xdc');
+  ASSERT_EQ(uuid1.data[15], '\xfe');
+
+  ASSERT_EQ(uuid2.data[0], '\x01');
+  ASSERT_EQ(uuid2.data[1], '\x23');
+  ASSERT_EQ(uuid2.data[2], '\x45');
+  ASSERT_EQ(uuid2.data[3], '\x67');
+  ASSERT_EQ(uuid2.data[4], '\x89');
+  ASSERT_EQ(uuid2.data[5], '\xab');
+  ASSERT_EQ(uuid2.data[6], '\xcd');
+  ASSERT_EQ(uuid2.data[7], '\xe0');
+  ASSERT_EQ(uuid2.data[8], '\x10');
+  ASSERT_EQ(uuid2.data[9], '\x32');
+  ASSERT_EQ(uuid2.data[10], '\x54');
+  ASSERT_EQ(uuid2.data[11], '\x76');
+  ASSERT_EQ(uuid2.data[12], '\x98');
+  ASSERT_EQ(uuid2.data[13], '\xba');
+  ASSERT_EQ(uuid2.data[14], '\xdc');
+  ASSERT_EQ(uuid2.data[15], '\x0e');
+}
+
+TEST(UuidTest, generate)
+{
+  zim::Uuid uuid1;
+  zim::Uuid uuid2;
+  ASSERT_TRUE(uuid1 == uuid2);
+  ASSERT_TRUE(uuid1 == zim::Uuid());
+  ASSERT_TRUE(uuid2 == zim::Uuid());
+
+  uuid1 = zim::Uuid::generate();
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != zim::Uuid());
+  ASSERT_TRUE(uuid2 == zim::Uuid());
+
+  // Since GNU Mach's clock isn't precise hence the time might be
+  // same during generating uuid1 and uuid2 leading to test
+  // failure. To bring the time difference between 2 sleep for a
+  // second. Thanks to Pino Toscano.
+#ifdef _WIN32
+  Sleep(1000);
+#else
+  sleep(1);
+#endif
+
+  uuid2 = zim::Uuid::generate();
+  ASSERT_TRUE(uuid1 != uuid2);
+  ASSERT_TRUE(uuid1 != zim::Uuid());
+  ASSERT_TRUE(uuid2 != zim::Uuid());
+}
+
+TEST(UuidTest, output)
+{
+  zim::Uuid uuid(
+      "\x55\x0e\x84\x00\xe2\x9b\x41\xd4\xa7\x16\x44\x66\x55\x44\x00\x00");
+  std::ostringstream out;
+  out << uuid;
+  std::string s = out.str();
+  ASSERT_EQ(s, "550e8400-e29b-41d4-a716-446655440000");
+}
+};
author	Kunal Mehta <legoktm@debian.org>
	Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)
committer	Kunal Mehta <legoktm@debian.org>
	Wed, 20 May 2020 20:51:33 +0000 (21:51 +0100)
.codecov.yml	[new file with mode: 0644]	patch \| blob
.github/FUNDING.yml	[new file with mode: 0644]	patch \| blob
.github/move.yml	[new file with mode: 0644]	patch \| blob
.github/workflows/ci.yml	[new file with mode: 0644]	patch \| blob
.gitignore	[new file with mode: 0644]	patch \| blob
AUTHORS	[new file with mode: 0644]	patch \| blob
COPYING	[new file with mode: 0644]	patch \| blob
ChangeLog	[new file with mode: 0644]	patch \| blob
README.md	[new file with mode: 0644]	patch \| blob
examples/createZimExample.cpp	[new file with mode: 0644]	patch \| blob
examples/meson.build	[new file with mode: 0644]	patch \| blob
include/meson.build	[new file with mode: 0644]	patch \| blob
include/zim/article.h	[new file with mode: 0644]	patch \| blob
include/zim/blob.h	[new file with mode: 0644]	patch \| blob
include/zim/error.h	[new file with mode: 0644]	patch \| blob
include/zim/file.h	[new file with mode: 0644]	patch \| blob
include/zim/fileheader.h	[new file with mode: 0644]	patch \| blob
include/zim/fileiterator.h	[new file with mode: 0644]	patch \| blob
include/zim/search.h	[new file with mode: 0644]	patch \| blob
include/zim/search_iterator.h	[new file with mode: 0644]	patch \| blob
include/zim/uuid.h	[new file with mode: 0644]	patch \| blob
include/zim/writer/article.h	[new file with mode: 0644]	patch \| blob
include/zim/writer/creator.h	[new file with mode: 0644]	patch \| blob
include/zim/writer/url.h	[new file with mode: 0644]	patch \| blob
include/zim/zim.h	[new file with mode: 0644]	patch \| blob
meson.build	[new file with mode: 0644]	patch \| blob
meson_options.txt	[new file with mode: 0644]	patch \| blob
scripts/libzim-compile-resources	[new file with mode: 0755]	patch \| blob
scripts/meson.build	[new file with mode: 0644]	patch \| blob
src/_dirent.h	[new file with mode: 0644]	patch \| blob
src/article.cpp	[new file with mode: 0644]	patch \| blob
src/blob.cpp	[new file with mode: 0644]	patch \| blob
src/buffer.cpp	[new file with mode: 0644]	patch \| blob
src/buffer.h	[new file with mode: 0644]	patch \| blob
src/cache.h	[new file with mode: 0644]	patch \| blob
src/cluster.cpp	[new file with mode: 0644]	patch \| blob
src/cluster.h	[new file with mode: 0644]	patch \| blob
src/compression.cpp	[new file with mode: 0644]	patch \| blob
src/compression.h	[new file with mode: 0644]	patch \| blob
src/config.h.in	[new file with mode: 0644]	patch \| blob
src/debug.h	[new file with mode: 0644]	patch \| blob
src/dirent.cpp	[new file with mode: 0644]	patch \| blob
src/endian_tools.h	[new file with mode: 0644]	patch \| blob
src/envvalue.cpp	[new file with mode: 0644]	patch \| blob
src/envvalue.h	[new file with mode: 0644]	patch \| blob
src/file.cpp	[new file with mode: 0644]	patch \| blob
src/file_compound.cpp	[new file with mode: 0644]	patch \| blob
src/file_compound.h	[new file with mode: 0644]	patch \| blob
src/file_part.h	[new file with mode: 0644]	patch \| blob
src/file_reader.cpp	[new file with mode: 0644]	patch \| blob
src/file_reader.h	[new file with mode: 0644]	patch \| blob
src/fileheader.cpp	[new file with mode: 0644]	patch \| blob
src/fileimpl.cpp	[new file with mode: 0644]	patch \| blob
src/fileimpl.h	[new file with mode: 0644]	patch \| blob
src/fs.h	[new file with mode: 0644]	patch \| blob
src/fs_unix.cpp	[new file with mode: 0644]	patch \| blob
src/fs_unix.h	[new file with mode: 0644]	patch \| blob
src/fs_windows.cpp	[new file with mode: 0644]	patch \| blob
src/fs_windows.h	[new file with mode: 0644]	patch \| blob
src/levenshtein.cpp	[new file with mode: 0644]	patch \| blob
src/levenshtein.h	[new file with mode: 0644]	patch \| blob
src/log.h	[new file with mode: 0644]	patch \| blob
src/md5.c	[new file with mode: 0644]	patch \| blob
src/md5.h	[new file with mode: 0644]	patch \| blob
src/meson.build	[new file with mode: 0644]	patch \| blob
src/search.cpp	[new file with mode: 0644]	patch \| blob
src/search_internal.h	[new file with mode: 0644]	patch \| blob
src/search_iterator.cpp	[new file with mode: 0644]	patch \| blob
src/template.cpp	[new file with mode: 0644]	patch \| blob
src/template.h	[new file with mode: 0644]	patch \| blob
src/tools.cpp	[new file with mode: 0644]	patch \| blob
src/tools.h	[new file with mode: 0644]	patch \| blob
src/uuid.cpp	[new file with mode: 0644]	patch \| blob
src/writer/_dirent.h	[new file with mode: 0644]	patch \| blob
src/writer/article.cpp	[new file with mode: 0644]	patch \| blob
src/writer/cluster.cpp	[new file with mode: 0644]	patch \| blob
src/writer/cluster.h	[new file with mode: 0644]	patch \| blob
src/writer/creator.cpp	[new file with mode: 0644]	patch \| blob
src/writer/creatordata.h	[new file with mode: 0644]	patch \| blob
src/writer/dirent.cpp	[new file with mode: 0644]	patch \| blob
src/writer/direntPool.h	[new file with mode: 0644]	patch \| blob
src/writer/queue.h	[new file with mode: 0644]	patch \| blob
src/writer/workers.cpp	[new file with mode: 0644]	patch \| blob
src/writer/workers.h	[new file with mode: 0644]	patch \| blob
src/writer/xapianIndexer.cpp	[new file with mode: 0644]	patch \| blob
src/writer/xapianIndexer.h	[new file with mode: 0644]	patch \| blob
src/xapian/htmlparse.cc	[new file with mode: 0644]	patch \| blob
src/xapian/htmlparse.h	[new file with mode: 0644]	patch \| blob
src/xapian/myhtmlparse.cc	[new file with mode: 0644]	patch \| blob
src/xapian/myhtmlparse.h	[new file with mode: 0644]	patch \| blob
src/xapian/namedentities.h	[new file with mode: 0644]	patch \| blob
src/zim_types.h	[new file with mode: 0644]	patch \| blob
static/meson.build	[new file with mode: 0644]	patch \| blob
static/resources_list.txt	[new file with mode: 0644]	patch \| blob
static/stopwords/af	[new file with mode: 0644]	patch \| blob
static/stopwords/ar	[new file with mode: 0644]	patch \| blob
static/stopwords/bg	[new file with mode: 0644]	patch \| blob
static/stopwords/bn	[new file with mode: 0644]	patch \| blob
static/stopwords/br	[new file with mode: 0644]	patch \| blob
static/stopwords/ca	[new file with mode: 0644]	patch \| blob
static/stopwords/cs	[new file with mode: 0644]	patch \| blob
static/stopwords/da	[new file with mode: 0644]	patch \| blob
static/stopwords/de	[new file with mode: 0644]	patch \| blob
static/stopwords/el	[new file with mode: 0644]	patch \| blob
static/stopwords/en	[new file with mode: 0644]	patch \| blob
static/stopwords/eo	[new file with mode: 0644]	patch \| blob
static/stopwords/es	[new file with mode: 0644]	patch \| blob
static/stopwords/et	[new file with mode: 0644]	patch \| blob
static/stopwords/eu	[new file with mode: 0644]	patch \| blob
static/stopwords/fa	[new file with mode: 0644]	patch \| blob
static/stopwords/fi	[new file with mode: 0644]	patch \| blob
static/stopwords/fr	[new file with mode: 0644]	patch \| blob
static/stopwords/ga	[new file with mode: 0644]	patch \| blob
static/stopwords/gl	[new file with mode: 0644]	patch \| blob
static/stopwords/ha	[new file with mode: 0644]	patch \| blob
static/stopwords/he	[new file with mode: 0644]	patch \| blob
static/stopwords/hi	[new file with mode: 0644]	patch \| blob
static/stopwords/hr	[new file with mode: 0644]	patch \| blob
static/stopwords/hu	[new file with mode: 0644]	patch \| blob
static/stopwords/hy	[new file with mode: 0644]	patch \| blob
static/stopwords/id	[new file with mode: 0644]	patch \| blob
static/stopwords/it	[new file with mode: 0644]	patch \| blob
static/stopwords/ja	[new file with mode: 0644]	patch \| blob
static/stopwords/ko	[new file with mode: 0644]	patch \| blob
static/stopwords/ku	[new file with mode: 0644]	patch \| blob
static/stopwords/la	[new file with mode: 0644]	patch \| blob
static/stopwords/lt	[new file with mode: 0644]	patch \| blob
static/stopwords/lv	[new file with mode: 0644]	patch \| blob
static/stopwords/mr	[new file with mode: 0644]	patch \| blob
static/stopwords/ms	[new file with mode: 0644]	patch \| blob
static/stopwords/nl	[new file with mode: 0644]	patch \| blob
static/stopwords/no	[new file with mode: 0644]	patch \| blob
static/stopwords/pl	[new file with mode: 0644]	patch \| blob
static/stopwords/pt	[new file with mode: 0644]	patch \| blob
static/stopwords/ro	[new file with mode: 0644]	patch \| blob
static/stopwords/ru	[new file with mode: 0644]	patch \| blob
static/stopwords/sk	[new file with mode: 0644]	patch \| blob
static/stopwords/sl	[new file with mode: 0644]	patch \| blob
static/stopwords/so	[new file with mode: 0644]	patch \| blob
static/stopwords/st	[new file with mode: 0644]	patch \| blob
static/stopwords/sv	[new file with mode: 0644]	patch \| blob
static/stopwords/sw	[new file with mode: 0644]	patch \| blob
static/stopwords/th	[new file with mode: 0644]	patch \| blob
static/stopwords/tl	[new file with mode: 0644]	patch \| blob
static/stopwords/tr	[new file with mode: 0644]	patch \| blob
static/stopwords/uk	[new file with mode: 0644]	patch \| blob
static/stopwords/ur	[new file with mode: 0644]	patch \| blob
static/stopwords/vi	[new file with mode: 0644]	patch \| blob
static/stopwords/yo	[new file with mode: 0644]	patch \| blob
static/stopwords/zh	[new file with mode: 0644]	patch \| blob
static/stopwords/zu	[new file with mode: 0644]	patch \| blob
subprojects/gtest.wrap	[new file with mode: 0644]	patch \| blob
test/cluster.cpp	[new file with mode: 0644]	patch \| blob
test/data/wikibooks_be_all_nopic_2017-02.zim	[new file with mode: 0644]	patch \| blob
test/data/wikibooks_be_all_nopic_2017-02_splitted.zimaa	[new file with mode: 0644]	patch \| blob
test/data/wikibooks_be_all_nopic_2017-02_splitted.zimab	[new file with mode: 0644]	patch \| blob
test/data/wikibooks_be_all_nopic_2017-02_splitted.zimac	[new file with mode: 0644]	patch \| blob
test/data/wikipedia_en_climate_change_nopic_2020-01.zim	[new file with mode: 0644]	patch \| blob
test/dirent.cpp	[new file with mode: 0644]	patch \| blob
test/find.cpp	[new file with mode: 0644]	patch \| blob
test/header.cpp	[new file with mode: 0644]	patch \| blob
test/iterator.cpp	[new file with mode: 0644]	patch \| blob
test/meson.build	[new file with mode: 0644]	patch \| blob
test/pytest/basic_open_test.py	[new file with mode: 0644]	patch \| blob
test/pytest/meson.build	[new file with mode: 0644]	patch \| blob
test/pytest/wrapper/libzim_ext.pyx	[new file with mode: 0644]	patch \| blob
test/pytest/wrapper/meson.build	[new file with mode: 0644]	patch \| blob
test/pytest/wrapper/zim_wrapper.pxd	[new file with mode: 0644]	patch \| blob
test/template.cpp	[new file with mode: 0644]	patch \| blob
test/uuid.cpp	[new file with mode: 0644]	patch \| blob