Coverage details for com.topcoder.document.index.persistence.impl.xml.XmlIndexPersistence

LineHitsSource
1 /*
2  * Copyright (C) 2006 TopCoder Inc., All Rights Reserved.
3  */
4 package com.topcoder.document.index.persistence.impl.xml;
5  
6 import com.topcoder.document.index.CollectionIndex;
7 import com.topcoder.document.index.DocumentIndex;
8 import com.topcoder.document.index.persistence.IndexPersistence;
9 import com.topcoder.document.index.persistence.IndexPersistenceException;
10 import com.topcoder.document.index.persistence.impl.PersistenceConfigurationException;
11 import com.topcoder.document.index.persistence.impl.Utils;
12 import com.topcoder.document.index.wordsource.WordSourceId;
13 import com.topcoder.util.compression.Base64Codec;
14 import com.topcoder.util.compression.Base64Decoder;
15 import com.topcoder.util.compression.Base64Encoder;
16 import org.xml.sax.Attributes;
17 import org.xml.sax.SAXException;
18 import org.xml.sax.helpers.DefaultHandler;
19  
20 import javax.xml.parsers.ParserConfigurationException;
21 import javax.xml.parsers.SAXParser;
22 import javax.xml.parsers.SAXParserFactory;
23 import java.io.BufferedOutputStream;
24 import java.io.File;
25 import java.io.FileFilter;
26 import java.io.FileOutputStream;
27 import java.io.IOException;
28 import java.io.OutputStream;
29 import java.text.CollationKey;
30 import java.text.Collator;
31 import java.text.MessageFormat;
32 import java.text.StringCharacterIterator;
33 import java.util.ArrayList;
34 import java.util.Collection;
35 import java.util.HashMap;
36 import java.util.HashSet;
37 import java.util.Iterator;
38 import java.util.List;
39 import java.util.Locale;
40 import java.util.Map;
41 import java.util.Set;
42 import java.util.zip.DataFormatException;
43  
44  
45 /**
46  * This is a concrete implementation of the IndexPersistence contract that uses XML files as storage medium.
47  * <p/>
48  * This implementation allows for creation of arbitrarily large document index files by sequencing (splitting) large
49  * files into chunks of file based data. Since it is required that binary data is stored in the XML files for document
50  * index a serialized Base64 encoded data will be both encoded and decoded by this implementation.
51  * <p/>
52  * <b>Thread safety:</b> This class is not designed to be thread-safe.
53  *
54  * @author AleaActaEst, TCSDEVELOPER
55  * @version 1.0
56  */
57243public class XmlIndexPersistence implements IndexPersistence {
58  
59     /**
60      * This constant is the file name used to represent a document usage as defined by the persistence contract. There
61      * is only one such file per XML persistence name space.
62      */
63     private static final String DOC_USAGE_FILE_NAME = "document_usage.xml";
64  
65     /**
66      * This is the file name prefix for all files that represent a Collection Index XML file storage (i.e. a document
67      * collection file) Each collection XML file name will have the following format:
68      * <tt>doc_collection_collectionId_.xml</tt> (the code appends an underscore after the id before <tt>.xml</tt>).
69      * <p/>
70      * For example here is a simple document collection XML file for a collection with id of <tt>175658</tt>:
71      * <tt>doc_collection_175658_.xml</tt>.
72      */
73     private static final String DOC_COLLECTION_FILE_NAME_PREFIX = "doc_collection_";
74  
75     /**
76      * This is the file name prefix for all files that represent a Document Index XML file storage (i.e. a document
77      * index file) Each document index file name will have the following format:<tt>doc_index_docId_sequenceid.xml</tt>.
78      * <p/>
79      * Here is an example of a document index XML file for a document with id of <tt>16527</tt>:
80      * <tt>doc_index_16527_1.xml</tt> (the first file of a document is looked up with the suffix <tt>_1.xml</tt>).
81      * <p/>
82      * The <tt>sequenceid</tt> is used to ‘link’ files with data that is larger than a configured limit of e.g. 2GB
83      * (this can be configured) this means that if there is indexed data that is larger than say 2GB the file is split
84      * into 2 files with the first having the sequence id of 1 and the second one presumably 2 (NOTE(review): confirm
85      * the numbering of follow-up files in writeDocumentDataFiles). This gives the flexibility for very large content.
86      */
87     private static final String DOC_INDEX_FILE_NAME_PREFIX = "doc_index_";
88     /**
89      * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link
90      * #persistenceDirPath} from the ConfigManager.
91      */
92     private static final String XML_PERSISTENCE_PATH_PROPERTY_NAME = "XmlPersistencePath";
93     /**
94      * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link
95      * #fileSizeThreshold} from the ConfigManager.
96      */
97     private static final String FILE_SIZE_THRESHOLD_PROPERTY_NAME = "FileSizeThreshold";
98     /**
99      * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link #fileSizeLimit}
100      * from the ConfigManager.
101      */
102     private static final String FILE_SIZE_LIMIT_PROPERTY_NAME = "FileSizeLimit";
103     /**
104      * This constant defines the charset used in XML encoding.
105      */
106     private static final String UTF_8_CHARSET = "UTF-8";
107     /**
108      * This constant defines the factor used in converting kilobyte values into byte-values and megabyte values into
109      * kilobytes.
110      */
111     private static final int KILO_FACTOR = 1024;
112  
113     /**
114      * This is a file size limit for saving a chunk of indexed data. It is configured in Mbytes; both constructors
115      * multiply the supplied value by KILO_FACTOR twice, so the value held in this field is in bytes. The limit is
116      * used to decide when to start chaining multiple files to store data that cannot fit into a single file due to
117      * size limitations of the specific OS or file system. It is initialized through the constructor and once
118      * initialized cannot be changed. The supplied value must be a positive integer (i.e. <tt>&gt;0</tt>).
119      */
120     private final int fileSizeLimit;
121  
122     /**
123      * This is a safety net size that is applied before the fileSizeLimit is reached. It is configured in Kbytes; both
124      * constructors multiply the supplied value by KILO_FACTOR, so the value held in this field is in bytes. It is
125      * initialized through the constructor and cannot be changed afterwards. The supplied value must be <tt>&gt;= 0</tt>.
126      */
127     private final int fileSizeThreshold;
128  
129     /**
130      * This is the file path to a directory where all the XML files that make up this persistence session are stored (it
131      * is almost like a data base namespace). This must be a valid fully qualified path. It is initialized from
132      * configuration through the constructor and once initialized cannot be changed. Cannot be <tt>null</tt> or an empty
133      * string.
134      */
135     private final String persistenceDirPath;
136  
137     /**
138      * This is a simple constructor which will populate the fileSizeLimit, fileSizeThreshold, and persistenceDirPath
139      * information from configuration by looking up the values from the given {@link
140      * com.topcoder.util.config.ConfigManager} namespace.
141      *
142      * @param namespace the configuration namespace to be used for looking up the values from the {@link
143      * com.topcoder.util.config.ConfigManager}
144      *
145      * @throws PersistenceConfigurationException
146      * if there are configuration issues encountered, e.g. missing or invalid values
147      * @throws IllegalArgumentException if namespace param is <tt>null</tt> or an empty (trim'd) String
148      */
14984    public XmlIndexPersistence(final String namespace) throws PersistenceConfigurationException {
15084        if (namespace == null) {
1511            throw new IllegalArgumentException("The parameter named [namespace] was null.");
152         }
15383        if (namespace.trim().length() == 0) {
1541            throw new IllegalArgumentException("The parameter named [namespace] was an empty String.");
155         }
15682        fileSizeLimit = Utils.lookupIntFromConfigManager(namespace, FILE_SIZE_LIMIT_PROPERTY_NAME, 1) * KILO_FACTOR
157             * KILO_FACTOR;
15877        fileSizeThreshold = Utils.lookupIntFromConfigManager(namespace, FILE_SIZE_THRESHOLD_PROPERTY_NAME, 0)
159             * KILO_FACTOR;
160         try {
16173            persistenceDirPath = checkPath(
162                 Utils.lookupValidStringFromConfigManager(namespace, XML_PERSISTENCE_PATH_PROPERTY_NAME));
1632        } catch (IllegalArgumentException e) {
1642            throw new PersistenceConfigurationException("The configured persistence directory was not usable.", e);
16569        }
166         //check whether the directory contains a valid document-usages file
167         try {
16869            readDocumentUsages();
1691        } catch (IndexPersistenceException e) {
1701            throw new PersistenceConfigurationException("The persistence directory configured [" + persistenceDirPath
171                 + "] did not contain a valid document_usages.xml.", e);
17268        }
17368    }
174  
175     /**
176      * This is a simple convenience constructor which will populate the fileSizeLimit, fileSizeThreshold, and
177      * persistenceDirPath information directly. The fileSizeLimit and fileSizeThreshold values should be in a meaningful
178      * relation to each other, otherwise it could happen that every word is persisted to a separate file (when
179      * <tt>fileSizeLimit * 1024 &lt;= fileSizeThreshold</tt> ).
180      *
181      * @param fileSizeLimit This is a file size limit in Mbytes for saving a chunk of indexed data, must be
182      * <tt>&gt;0</tt>
183      * @param fileSizeThreshold This is a safety net size in Kbytes that is applied before the fileSizeLimit is
184      * reached, must be <tt>&gt;=0</tt>
185      * @param persistenceDirPath This is the file path to a directory where all the XML files that make up this
186      * persistence session are stored
187      *
188      * @throws IllegalArgumentException in case fileSizeLimit is &lt;1 or fileSizeThreshold is &lt;0 or the given
189      * persistenceDirPath does not denote an existent absolute path of a directory that
190      * is writable and contains a valid document_usage.xml file
191      */
1929    public XmlIndexPersistence(final int fileSizeLimit, final int fileSizeThreshold, final String persistenceDirPath) {
1939        if (fileSizeLimit < 1) {
1941            throw new IllegalArgumentException(
195                 "The parameter named [fileSizeLimit] was expected to be >0 , but was [" + fileSizeLimit + "].");
196         }
1978        if (fileSizeThreshold < 0) {
1981            throw new IllegalArgumentException(
199                 "The parameter named [fileSizeThreshold] was expected to be >=0 , but was [" + fileSizeThreshold
200                     + "].");
201         }
2027        this.fileSizeLimit = fileSizeLimit * KILO_FACTOR * KILO_FACTOR;
2037        this.fileSizeThreshold = fileSizeThreshold * KILO_FACTOR;
2047        this.persistenceDirPath = checkPath(persistenceDirPath);
205         //check whether the directory contains a valid document-usages file
206         try {
2073            readDocumentUsages();
2081        } catch (IndexPersistenceException e) {
2091            final IllegalArgumentException illegalArgumentException = new IllegalArgumentException(
210                 "The persistence directory configured [" + persistenceDirPath
211                     + "] did not contain a valid document_usages.xml.");
2121            illegalArgumentException.initCause(e);
2131            throw illegalArgumentException;
2142        }
2152    }
216  
217     /**
218      * This method adds the given DocumentIndex to the persistence.
219      *
220      * @param documentIndex WordSource representing the document to be indexed, should not be <tt>null</tt>
221      *
222      * @throws IllegalArgumentException when documentIndex is <tt>null</tt>
223      * @throws IndexPersistenceException when the method fails to add DocumentIndex to the persistence
224      */
225     public void addDocumentIndex(final DocumentIndex documentIndex) throws IndexPersistenceException {
22625        if (documentIndex == null) {
2271            throw new IllegalArgumentException("The parameter named [documentIndex] was null.");
228         }
229  
23024        final String documentID = Utils.createIdString(documentIndex.getWordSourceId());
231  
23224        final File firstFile = new File(persistenceDirPath, DOC_INDEX_FILE_NAME_PREFIX + documentID + "_"
233             + 1 + ".xml");
23424        if (firstFile.exists()) {
2351            throw new IndexPersistenceException("The given index does already exist in the persistent storage as file ["
236                 + firstFile.getName() + "].");
237         }
238  
23923        final Locale locale = documentIndex.getLocale();
24023        final WordSourceId sourceId = documentIndex.getWordSourceId();
241  
242         // serialize the source identity and encode it into a Base64 String
24323        final byte[] input = Utils.serializeObject(sourceId.getSourceIdentity());
24423        final byte[] bytes = new byte[input.length * 2];
24523        final Base64Encoder base64Encoder = new Base64Encoder(0, null, Base64Codec.STANDARD_ALPHABET);
24623        base64Encoder.setInput(input);
24723        base64Encoder.finish();
24823        final int size = base64Encoder.deflate(bytes);
24923        final String sourceIdentityString = new String(bytes, 0, size);
250  
251         //build the delimiters list
25223        final String[] delimiters = sourceId.getDelimiters();
25323        final StringBuffer temp = new StringBuffer();
25497        for (int i = 0; i < delimiters.length; i++) {
25574            temp.append("<entry delimiter=\"");
25674            temp.append(escapeForXML(delimiters[i]));
25774            temp.append("\"/>");
258         }
25923        final String delimiterEntries = temp.toString();
260  
261         // this has not been extracted to be a string constant
262         // as the readability and maintainability would be
263         // decreased when separating the format message with
264         // its placeholders from the actual values used
265         // for the placeholders
26623        final String header = MessageFormat.format(
267             "<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?>"
268                 + "<doc lang=\"{0}-{1}\">"
269                 + "<document-index document-id=\"{2}\">"
270                 + "<word-source-data>"
271                 + "<source-identity>{3}</source-identity>"
272                 + "<class-name>{4}</class-name>"
273                 + "<locale language=\"{0}\" country=\"{1}\" variant=\"{5}\"/>"
274                 + "<delimiters>{6}</delimiters>"
275                 + "</word-source-data>",
276             new Object[]{
277                 escapeForXML(locale.getLanguage()),
278                 escapeForXML(locale.getCountry()),
279                 escapeForXML(documentID),
280                 sourceIdentityString, // unescaped as Base64 is ASCII only
281                 escapeForXML(sourceId.getSourceClassName()),
282                 escapeForXML(locale.getVariant()),
283                 delimiterEntries // unescaped as already escaped during construction
284             }
285         );
286  
28723        final String footer = "</document-index></doc>";
288  
28923        writeDocumentDataFiles(header, footer, documentID, documentIndex);
290  
291         //insert the entry representing the document into document_usages.xml
29223        final Map map = readDocumentUsages();
29323        map.put(documentID, new DocumentUsage(documentID));
29423        writeDocumentUsages(map);
29523    }
296  
297     /**
298      * This method retrieves a DocumentIndex with the specified WordSourceId from the persistence. It does return
299      * <tt>null</tt>, if DocumentIndex with the given WordSourceId is not found in the persistence.
300      * <p/>
301      * CS section 1.4.4.1 describes the algorithm of this method.
302      *
303      * @param wordSourceId WordSourceId of document to retrieve
304      *
305      * @return the retrieved DocumentIndex, or <tt>null</tt> if document index with given WordSourceId is not found in
306      * the persistence
307      *
308      * @throws IllegalArgumentException when wordSourceId is <tt>null</tt>
309      * @throws IndexPersistenceException when the implementation fails to retrieve document index with the specified
310      * WordSourceId
311      */
312     public DocumentIndex getDocumentIndex(final WordSourceId wordSourceId) throws IndexPersistenceException {
31311        if (wordSourceId == null) {
3141            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
315         }
316  
31710        final String documentId = Utils.createIdString(wordSourceId);
318  
31910        return getDocumentIndex(documentId);
320     }
321  
322     /**
323      * This method removes a document index with given WordSourceId from the persistence. When the document index is not
324      * found, or function fail to remove it, an exception is thrown. Function does also not succeed when document index
325      * use count is not zero.
326      * <p/>
327      * CS section 1.4.4.2 describes the algorithm of this method.
328      *
329      * @param wordSourceId WordSourceId of document index to remove
330      *
331      * @throws IllegalArgumentException when WordSourceId is <tt>null</tt>
332      * @throws IndexPersistenceException when document index is not found in the persistence, its use count is not zero,
333      * or error happens when trying to remove it
334      */
335     public void removeDocumentIndex(final WordSourceId wordSourceId) throws IndexPersistenceException {
3365        if (wordSourceId == null) {
3371            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
338         }
339  
3404        final int count = getDocumentUseCount(wordSourceId);
3413        if (count != 0) {
3421            throw new IndexPersistenceException("The index with the given id [" + wordSourceId
343                 + "] cannot be deleted as its use count is not 0, but is " + count + ".");
344         }
345  
3462        final String documentId = Utils.createIdString(wordSourceId);
3472        final File[] indexFiles = new File(persistenceDirPath).listFiles(
348             new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX + documentId, ".xml"));
3494        for (int i = 0; i < indexFiles.length; i++) {
3502            final File indexFile = indexFiles[i];
3512            indexFile.delete();
352         }
353  
354         // update document_usages.xml
355         // (remove element representing document)
3562        final Map map = readDocumentUsages();
3572        map.remove(documentId);
3582        writeDocumentUsages(map);
3592    }
360  
361     /**
362      * This method stores the given document collection index in the persistence.
363      * <p/>
364      * CS section 1.4.4.4 describes the algorithm of this method.
365      *
366      * @param collectionIndex CollectionIndex to store
367      *
368      * @throws IllegalArgumentException if collectionIndex is <tt>null</tt>
369      * @throws IndexPersistenceException if fails to create and store document collection index or the given index
370      * already exists in persistence or contains unpersisted documents
371      */
372     public void addCollectionIndex(final CollectionIndex collectionIndex)
373         throws IndexPersistenceException {
37411        if (collectionIndex == null) {
3751            throw new IllegalArgumentException("The parameter named [collectionIndex] was null.");
376         }
377  
37810        final String collectionId = collectionIndex.getId();
37910        final File persistenceFile =
380             new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml");
381  
38210        if (persistenceFile.exists()) {
3831            throw new IndexPersistenceException("The given Collection index does already exist in persistence as file ["
384                 + persistenceFile.getName() + "].");
385         }
386  
387         //build the documents list
3889        final Set allDocumentIds = collectionIndex.getAllDocumentIds();
389         // this is a set of the string form of the document ids,
390         // re-used when updating the usage information
3919        final Set documentIdStrings = new HashSet();
3929        final StringBuffer temp = new StringBuffer();
3939        for (Iterator iterator = allDocumentIds.iterator(); iterator.hasNext();) {
39411            final WordSourceId sourceId = (WordSourceId) iterator.next();
39511            final String documentId = Utils.createIdString(sourceId);
396  
397             //check whether document has been persisted
39811            final String expectedFileName =
399                 DOC_INDEX_FILE_NAME_PREFIX + documentId + "_" + 1 + ".xml";
40011            if (!new File(persistenceDirPath, expectedFileName).exists()) {
4011                throw new IndexPersistenceException(
402                     "Cannot add collection when its contained documents are not yet persisted (document with id ["
403                         + sourceId + "] was not persisted).");
404             }
405  
40610            documentIdStrings.add(documentId);
40710            temp.append("<document-id>");
40810            temp.append(escapeForXML(documentId));
40910            temp.append("</document-id>");
410         }
4118        final String documentList = temp.toString();
412  
413         // this has not been extracted to be a string constant
414         // as the readability and maintainability would be
415         // decreased when separating the format message with
416         // its placeholders from the actual values used
417         // for the placeholders
4188        final String fileContent = MessageFormat.format(
419             "<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?>"
420                 + "<document-collection collection-id=\"{0}\">"
421                 + "<documents>{1}</documents>"
422                 + "</document-collection>",
423             new Object[]{
424                 escapeForXML(collectionId),
425                 documentList // already escaped
426             }
427         );
428  
429         //write out the data to the file
4308        writeUTF8File(fileContent, persistenceFile);
431  
4328        if (!documentIdStrings.isEmpty()) {
433             //update document usages
4348            final Map documentUsages = readDocumentUsages();
4358            for (Iterator iterator = documentIdStrings.iterator(); iterator.hasNext();) {
43610                final String documentId = (String) iterator.next();
43710                DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId);
43810                if (usage == null) {
4390                    usage = new DocumentUsage(documentId);
4400                    documentUsages.put(documentId, usage);
441                 }
44210                usage.add(collectionId);
443             }
4448            writeDocumentUsages(documentUsages);
445         }
4468    }
447  
448     /**
449      * This method retrieves a document collection index with specified identifier. Id does return <tt>null</tt> when
450      * collection with specified identifier is not found
451      * <p/>
452      * CS section 1.4.4.5 describes the algorithm of this method.
453      *
454      * @param collectionId identifier of document collection index to retrieve
455      *
456      * @return CollectionIndex with specified identifier or <tt>null</tt> if the collection index does not exist in
457      * persistence
458      *
459      * @throws IllegalArgumentException when collectionId is <tt>null</tt> or empty string
460      * @throws IndexPersistenceException when collection index can not be retrieved
461      */
462     public CollectionIndex getCollectionIndex(final String collectionId) throws IndexPersistenceException {
46310        if (collectionId == null) {
4641            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
465         }
4669        if (collectionId.trim().length() == 0) {
4671            throw new IllegalArgumentException("The parameter named [collectionId] was an empty String.");
468         }
469  
4708        final File persistenceFile =
471             new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml");
472  
4738        if (!persistenceFile.exists()) {
4742            return null;
475         }
476  
4776        final DocumentCollectionHandler dh = new DocumentCollectionHandler();
4786        saxParse(persistenceFile, dh);
4796        final String[] documentIds = dh.getDocumentIds();
480  
4816        final Map wordsOfCollection = new HashMap();
4826        final Set documentSourceIds = new HashSet();
483  
48414        for (int i = 0; i < documentIds.length; i++) {
4858            final String documentId = documentIds[i];
4868            final DocumentIndex index = getDocumentIndex(documentId);
4878            final WordSourceId sourceId = index.getWordSourceId();
488  
489             // add all words of the index to the collection's word-index
490             // map and add a reference to the current index
4918            final Set wordsOfIndex = index.getWords().keySet();
4928            for (Iterator iterator = wordsOfIndex.iterator(); iterator.hasNext();) {
49324                final CollationKey collationKey = (CollationKey) iterator.next();
49424                Set documentsCotainingWord = (Set) wordsOfCollection.get(collationKey);
49524                if (documentsCotainingWord == null) {
49618                    documentsCotainingWord = new HashSet();
49718                    wordsOfCollection.put(collationKey, documentsCotainingWord);
498                 }
49924                documentsCotainingWord.add(sourceId);
500             }
501             // add document to collection's contained documents set
5028            documentSourceIds.add(sourceId);
503         }
504         // build the collection object and return it
5056        return new CollectionIndex(null, wordsOfCollection, documentSourceIds, collectionId);
506     }
507  
508     /**
509      * This method does remove document collection index with specified identifier from the persistence.
510      * <p/>
511      * CS section 1.4.4.7 describes the algorithm of this method.
512      *
513      * @param collectionId identifier of document collection index to remove
514      *
515      * @throws IllegalArgumentException when collectionId is <tt>null</tt> or empty string
516      * @throws IndexPersistenceException when fails to remove CollectionIndex with given identifier or the collection
517      * does not exist in persistence
518      */
519     public void removeCollectionIndex(final String collectionId) throws IndexPersistenceException {
5207        if (collectionId == null) {
5211            throw new IllegalArgumentException("The parameter named [collectionId] was null.");
522         }
5236        if (collectionId.trim().length() == 0) {
5241            throw new IllegalArgumentException("The parameter named [collectionId] was an empty String.");
525         }
5265        final File persistenceFile =
527             new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml");
528  
5295        if (!persistenceFile.exists()) {
5302            throw new IndexPersistenceException(
531                 "The Collection index with the given id [" + collectionId + "] does not exist in persistence as file ["
532                     + persistenceFile.getName() + "].");
533         }
534         //delete the index file
5353        persistenceFile.delete();
536  
537         //update document usages
5383        final Map documentUsages = readDocumentUsages();
539  
5403        final Collection usages = documentUsages.values();
5413        for (Iterator iterator = usages.iterator(); iterator.hasNext();) {
5424            final DocumentUsage documentUsage = (DocumentUsage) iterator.next();
5434            documentUsage.remove(collectionId);
544         }
545  
5463        writeDocumentUsages(documentUsages);
5473    }
548  
549     /**
550      * This method does update specified CollectionIndex in the persistence.
551      * <p/>
552      * CS section 1.4.4.6 describes the algorithm of this method.
553      *
554      * @param collectionIndex CollectionIndex to update
555      *
556      * @throws IllegalArgumentException if collectionIndex parameter is <tt>null</tt>
557      * @throws IndexPersistenceException if any error happens when updating the collection index in the persistence,
558      * this includes the case, when the specified collectionIndex is not found in the
559      * persistence
560      */
561     public void updateCollectionIndex(final CollectionIndex collectionIndex)
562         throws IndexPersistenceException {
5633        if (collectionIndex == null) {
5641            throw new IllegalArgumentException("The parameter named [collectionIndex] was null.");
565         }
566         // There is no real overhead in implementing the method this
567         // way and it avoids a lot of code duplication too
5682        removeCollectionIndex(collectionIndex.getId());
5691        addCollectionIndex(collectionIndex);
5701    }
571  
572     /**
573      * This method increases the use count value for document index with specified WordSourceId.
574      *
575      * @param wordSourceId WordSourceId of document index which to update use count
576      *
577      * @throws IllegalArgumentException if wordSourceId is <tt>null</tt>
578      * @throws IndexPersistenceException when fails to increase document index use count in the persistence or document
579      * index with id specified does not exist in persistence
580      */
581     public void increaseDocumentUseCount(final WordSourceId wordSourceId)
582         throws IndexPersistenceException {
5838        if (wordSourceId == null) {
5841            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
585         }
586  
5877        final String documentId = Utils.createIdString(wordSourceId);
588  
5897        final Map documentUsages = readDocumentUsages();
5907        final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId);
5917        if (usage == null) {
5921            throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId
593                 + "], string value is [" + documentId + "] does not exist in the persistence.");
594         }
5956        usage.incUsageCount();
596  
5976        writeDocumentUsages(documentUsages);
5986    }
599  
600     /**
601      * This method decreases the use count value for document index with specified WordSourceId.
602      *
603      * @param wordSourceId WordSourceId of document index of which to update use count
604      *
605      * @throws IllegalArgumentException if wordSourceId is <tt>null</tt>
606      * @throws IndexPersistenceException when fails to decrease document index use count in the persistence or document
607      * index with id specified does not exist in persistence
608      */
609     public void decreaseDocumentUseCount(final WordSourceId wordSourceId)
610         throws IndexPersistenceException {
6117        if (wordSourceId == null) {
6121            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
613         }
614  
6156        final String documentId = Utils.createIdString(wordSourceId);
616  
6176        final Map documentUsages = readDocumentUsages();
6186        final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId);
6196        if (usage == null) {
6201            throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId
621                 + "], string value is [" + documentId + "] does not exist in the persistence.");
622         }
6235        usage.decUsageCount();
624  
6255        writeDocumentUsages(documentUsages);
6265    }
627  
628     /**
629      * This method does return set of WordSourceId of documents that have been persisted in this persistence instance.
630      * <p/>
631      * CS section 1.4.4.1 describes the algorithm of this method.
632      *
633      * @return set of WordSourceId of documents that have been persisted
634      *
635      * @throws IndexPersistenceException when fails to retrieve the ids
636      */
637     public Set getIndexedDocuments() throws IndexPersistenceException {
6384        final Set ret = new HashSet();
6394        final File[] indexFiles = new File(persistenceDirPath).listFiles(
640             new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX, "_1.xml"));
641  
6424        if (indexFiles == null) {
6430            throw new IndexPersistenceException("An error has occurred while scanning the persistence directory ["
644                 + persistenceDirPath + "] for matching persistence files.");
645         }
646  
6478        for (int i = 0; i < indexFiles.length; i++) {
6484            final WordSourceIdHandler dh = new WordSourceIdHandler();
6494            saxParse(indexFiles[i], dh);
6504            ret.add(dh.createWordSourceId());
651         }
6524        return ret;
653     }
654  
655     /**
656      * This method does return the count of how many different groups this document is a member of.
657      *
658      * @param wordSourceId WordSourceId of document index of which to retrieve use count
659      *
660      * @return the use count of the document with the given id
661      *
662      * @throws IllegalArgumentException if wordSourceId is <tt>null</tt>
663      * @throws IndexPersistenceException when fails to retrieve document index use count in the persistence or document
664      * index with id specified does not exist in persistence
665      */
666     public int getDocumentUseCount(final WordSourceId wordSourceId)
667         throws IndexPersistenceException {
66820        if (wordSourceId == null) {
6691            throw new IllegalArgumentException("The parameter named [wordSourceId] was null.");
670         }
671  
67219        final String documentId = Utils.createIdString(wordSourceId);
673  
67419        final Map documentUsages = readDocumentUsages();
67519        final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId);
67619        if (usage == null) {
6772            throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId
678                 + "], string value is [" + documentId + "] does not exist in the persistence.");
679         }
68017        return usage.getUsageCount();
681     }
682  
683     /**
684      * This method checks whether the given non-<tt>null</tt> non-empty String denotes a Path to a existent and writable
685      * directory.
686      *
687      * @param path the path to be checked
688      *
689      * @return the checked path
690      *
691      * @throws IllegalArgumentException in case the given path violates any of the rules stated above
692      */
693     private String checkPath(final String path) {
69478        if (path == null) {
6951            throw new IllegalArgumentException("The parameter named [path] was null.");
696         }
69777        if (path.trim().length() == 0) {
6981            throw new IllegalArgumentException("The parameter named [path] was an empty String.");
699         }
700  
70176        final File file = new File(path);
70276        if (!file.exists()) {
7032            throw new IllegalArgumentException("The given path [" + path + "] does not exist.");
704         }
70574        if (!file.isDirectory()) {
7062            throw new IllegalArgumentException("The given path [" + path + "] was not a directory.");
707         }
70872        if (!file.canRead()) {
7090            throw new IllegalArgumentException("The given path [" + path + "] is not readable.");
710         }
71172        if (!file.canWrite()) {
7120            throw new IllegalArgumentException("The given path [" + path + "] is not writable.");
713         }
71472        return path;
715     }
716  
717     /**
718      * This method retrieves a DocumentIndex with the specified documentId (calculated from the wordSourceId) from the
719      * persistence. It does return <tt>null</tt>, if DocumentIndex with the given WordSourceId is not found in the
720      * persistence.
721      * <p/>
722      * CS section 1.4.4.1 describes the algorithm of this method.
723      *
724      * @param documentId id of document to retrieve
725      *
726      * @return the retrieved DocumentIndex, or <tt>null</tt> if document index with given WordSourceId is not found in
727      * the persistence
728      *
729      * @throws IllegalArgumentException when wordSourceId is <tt>null</tt>
730      * @throws IndexPersistenceException when the implementation fails to retrieve document index with the specified
731      * WordSourceId
732      */
733     private DocumentIndex getDocumentIndex(final String documentId) throws IndexPersistenceException {
73418        final File[] indexFiles = new File(persistenceDirPath).listFiles(
735             new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX + documentId, ".xml"));
736  
73718        if (indexFiles == null) {
7380            throw new IndexPersistenceException("An error has occurred while scanning the persistence directory ["
739                 + persistenceDirPath + "] for matching persistence files.");
740         }
74118        if (indexFiles.length == 0) {
7422            return null;
743         }
744  
74516        final DocumentIndexHandler dh = new DocumentIndexHandler();
74635        for (int i = 0; i < indexFiles.length; i++) {
74719            final String expectedFileName =
748                 DOC_INDEX_FILE_NAME_PREFIX + documentId + "_" + (i + 1) + ".xml";
74919            final File currentFile = new File(persistenceDirPath, expectedFileName);
75019            if (!currentFile.exists()) {
7510                throw new IndexPersistenceException(
752                     "Corrupt Persistence detected: Expected to encounter file with name [" + expectedFileName
753                         + "], but next file found has name [" + currentFile.getName() + "].");
754  
755             }
75619            saxParse(currentFile, dh);
757         }
758  
75916        return dh.createDocumentIndex();
760     }
761  
762     /**
763      * This method writes the actual document index data files.
764      *
765      * @param header the header XML-fragment string to be used when starting a new file
766      * @param footer the footer XML-fragment string to be used when ending a file
767      * @param documentID the document id calculated from the document index
768      * @param documentIndex the index to be written to the files
769      *
770      * @throws IndexPersistenceException in case the operation fails
771      */
772     private void writeDocumentDataFiles(final String header, final String footer, final String documentID,
773                                         final DocumentIndex documentIndex)
774         throws
775         IndexPersistenceException {
77623        OutputStream outputStream = null;
777         try {
77823            final byte[] headerData = header.getBytes(UTF_8_CHARSET);
77923            final byte[] footerData = footer.getBytes(UTF_8_CHARSET);
780  
781             //start the initial file here
78223            int fileSequence = 1;
78323            outputStream = new BufferedOutputStream(
784                 new FileOutputStream(new File(persistenceDirPath, DOC_INDEX_FILE_NAME_PREFIX + documentID + "_"
785                     + (fileSequence++) + ".xml")));
78623            outputStream.write(headerData);
78723            int bytesWritten = headerData.length;
788  
789             // iterate the words and write them out
79023            final Set set = documentIndex.getWords().entrySet();
79123            for (Iterator iterator = set.iterator(); iterator.hasNext();) {
79211060                final Map.Entry entry = (Map.Entry) iterator.next();
79311060                final List indices = (List) entry.getValue();
79411060                final StringBuffer wordData = new StringBuffer();
79511060                wordData.append("<word-index><word>");
79611060                wordData.append(escapeForXML(((CollationKey) entry.getKey()).getSourceString()));
79711060                wordData.append("</word>");
79811060                for (Iterator iterator1 = indices.iterator(); iterator1.hasNext();) {
79922180                    wordData.append("<pos>");
80022180                    wordData.append((Integer) iterator1.next());
80122180                    wordData.append("</pos>");
802                 }
80311060                wordData.append("</word-index>");
80411060                final byte[] wordEntry = wordData.toString().getBytes(UTF_8_CHARSET);
805  
806                 // we have to start a new file in case
807                 // the data to be written would exceed the file size threshold
80811060                if (bytesWritten + wordEntry.length + footerData.length >= fileSizeLimit - fileSizeThreshold) {
809                     //close previous file
8103                    outputStream.write(footerData);
8113                    outputStream.close();
812  
813                     //create new file
8143                    outputStream = new BufferedOutputStream(new FileOutputStream(new File(persistenceDirPath,
815                         DOC_INDEX_FILE_NAME_PREFIX + documentID + "_"
816                             + (fileSequence++) + ".xml")));
817  
818                     //write header
8193                    outputStream.write(headerData);
8203                    bytesWritten = headerData.length;
821                 }
822  
823                 //now write the word entry data to current file
82411060                outputStream.write(wordEntry);
82511060                bytesWritten += wordEntry.length;
826             }
82723            outputStream.write(footerData);
82823            outputStream.flush();
8290        } catch (IOException e) {
8300            throw new IndexPersistenceException("Error while persisting index [" + documentIndex + "] to file.", e);
831         } finally {
83223            safeClose(outputStream);
83323        }
83423    }
835  
836     /**
837      * This method closes the given output stream, ignoring all exceptions that may occur during this operation .
838      *
839      * @param outputStream the stream to be closed, may be <tt>null</tt>
840      */
841     private static void safeClose(final OutputStream outputStream) {
84278        if (outputStream != null) {
843             try {
84478                outputStream.close();
8450            } catch (IOException e) {
846                 // There is nothing we can do here, as we may have come here due to an exception, we
847                 // cannot re-throw this one as it would mask the original exception.
84878            }
849         }
85078    }
851  
852     /**
853      * This method creates a new SAXParser and lets it parse the given file using the given DefaultHandler.
854      *
855      * @param file the file to be parsed
856      * @param defaultHandler the handler to be used when parsing
857      *
858      * @throws IndexPersistenceException in case the parse operation fails
859      * @throws IllegalArgumentException incase any arg is <tt>null</tt>
860      */
861     private static void saxParse(final File file, final DefaultHandler defaultHandler)
862         throws IndexPersistenceException {
863167        if (file == null) {
8640            throw new IllegalArgumentException("The parameter named [file] was null.");
865         }
866167        if (defaultHandler == null) {
8670            throw new IllegalArgumentException("The parameter named [defaultHandler] was null.");
868         }
869  
870         final SAXParser saxParser;
871         try {
872167            saxParser = SAXParserFactory.newInstance().newSAXParser();
8730        } catch (ParserConfigurationException e) {
8740            throw new IndexPersistenceException("Error while creating SAX parser.", e);
8750        } catch (SAXException e) {
8760            throw new IndexPersistenceException("Error while creating SAX parser.", e);
877167        }
878         try {
879167            saxParser.parse(file, defaultHandler);
8800        } catch (SAXException e) {
8810            throw new IndexPersistenceException("Error while parsing input file [" + file.getName() + "] .",
882                 e);
883  
8840        } catch (IOException e) {
8850            throw new IndexPersistenceException("Error while parsing input file [" + file.getName() + "] .",
886                 e);
887167        }
888167    }
889  
890     /**
891      * This utility method retrieves an attribute value from the given attributes list, throwing a SAXException in case
892      * the attribute does not exist.
893      *
894      * @param attributes the attributes list to retrieve the value from
895      * @param attrName the attribute name
896      * @param tagName the current tag name, used in error messages
897      *
898      * @return the attribute value
899      *
900      * @throws SAXException in case the attribute does not exist
901      */
902     private static String getAttribute(final Attributes attributes, final String attrName, final String tagName)
903         throws SAXException {
904243        final String attributeValue = attributes.getValue(attrName);
905243        if (attributeValue == null) {
9060            throw new SAXException(
907                 "The required attribute [" + attrName + "] of tag [" + tagName + "] was missing.");
908         }
909243        return attributeValue;
910     }
911  
912     /**
913      * This method writes the document_usage.xml from the given map, which is expected to contain values of type {@link
914      * DocumentUsage}. All these values are written to the file.
915      *
916      * @param documentUsages the document usages map to be written
917      *
918      * @throws IndexPersistenceException in case the operation fails
919      */
920     private void writeDocumentUsages(final Map documentUsages) throws IndexPersistenceException {
92147        final StringBuffer fileContent = new StringBuffer();
92247        final Collection usages = documentUsages.values();
92347        fileContent.append("<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?><documents>");
92447        for (Iterator iterator = usages.iterator(); iterator.hasNext();) {
92551            final DocumentUsage documentUsage = (DocumentUsage) iterator.next();
92651            fileContent.append("<document-usage document-id=\"");
92751            fileContent.append(escapeForXML(documentUsage.getDocumentId()));
92851            fileContent.append("\" usage-count=\"");
92951            fileContent.append(documentUsage.getUsageCount());
93051            fileContent.append("\"><collection-inclusion>");
93151            final String[] collectionIds = documentUsage.getCollectionIds();
93264            for (int i = 0; i < collectionIds.length; i++) {
93313                final String id = collectionIds[i];
93413                fileContent.append("<collection collection-id=\"");
93513                fileContent.append(escapeForXML(id));
93613                fileContent.append("\"/>");
937             }
93851            fileContent.append("</collection-inclusion>");
93951            fileContent.append("</document-usage>");
940         }
94147        fileContent.append("</documents>");
94247        writeUTF8File(fileContent.toString(), new File(persistenceDirPath, DOC_USAGE_FILE_NAME));
94347    }
944  
945     /**
946      * This method writes the given String to the given file, encoding the content as UTF-8.
947      *
948      * @param fileContent the content to be written
949      * @param persistenceFile the file to write to
950      *
951      * @throws IndexPersistenceException in case the operation fails
952      */
953     private static void writeUTF8File(final String fileContent, final File persistenceFile)
954         throws IndexPersistenceException {
95555        OutputStream outputStream = null;
956         try {
95755            final byte[] fileData = fileContent.getBytes(UTF_8_CHARSET);
95855            outputStream = new FileOutputStream(persistenceFile);
95955            outputStream.write(fileData);
9600        } catch (IOException e) {
9610            throw new IndexPersistenceException(
962                 "Error while writing persistence file [" + persistenceFile.getName() + "].", e);
963         } finally {
96455            safeClose(outputStream);
96555        }
96655    }
967  
968     /**
969      * This method does parse the document_usage.xml file. The content is returned as a map containing String keys being
970      * the documentIds and values of type {@link DocumentUsage}.
971      *
972      * @return the map representing the content of the document_usage.xml file
973      *
974      * @throws IndexPersistenceException in case the operation fails
975      */
976     private Map readDocumentUsages() throws IndexPersistenceException {
977140        final File docUsageFile = new File(persistenceDirPath, DOC_USAGE_FILE_NAME);
978140        if (docUsageFile.exists()) {
979138            final DocumentUsageHandler dh = new DocumentUsageHandler();
980138            saxParse(docUsageFile, dh);
981138            return dh.getDocumentUsages();
982         } else {
9832            throw new IndexPersistenceException("The document usage file,which was expected to exist at ["
984                 + docUsageFile.getAbsolutePath() + "] did not exist.");
985         }
986     }
987  
988     /**
989      * Replace characters having special meaning in XML with their escaped equivalents, using character entities such as
990      * <tt>'&amp;amp;'</tt> or numeric entities such as<tt>'&amp;#12;'</tt>.
991      * <p/>
992      * The filtering whether a character needs to be escaped is done with respect to the fact that the output is written
993      * as UTF-8 encoding, i.e. two-byte characters do not need to be escaped.
994      * <p/>
995      * See: <ul><li>http://www.w3.org/TR/2000/REC-xml-20001006#syntax</li>
996      * <p/>
997      * <li>http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</li>
998      * <p/>
999      * <li>http://www.w3.org/TR/2000/REC-xml-20001006#sec-entexpand</li></ul>
1000      *
1001      * @param string the String to be escaped
1002      *
1003      * @return the escaped version of the given String
1004      */
1005     private static String escapeForXML(final String string) {
100611331        final StringBuffer result = new StringBuffer();
1007  
100811331        final StringCharacterIterator iterator = new StringCharacterIterator(string);
100911331        char ch = iterator.current();
1010100604        while (ch != StringCharacterIterator.DONE) {
101189273            switch (ch) {
1012             case '<':
101320                result.append("&lt;");
101420                break;
1015             case '>':
101620                result.append("&gt;");
101720                break;
1018             case '"':
101920                result.append("&quot;");
102020                break;
1021             case '\'':
102219                result.append("&apos;");
102319                break;
1024             case '&':
10251                result.append("&amp;");
10261                break;
1027             default:
1028                 // If the character is not printable, print as character reference.
1029                 // Non printables are below ASCII space
103089193                if (ch < ' ' || ch == '\u00f7') {
10314                    result.append("&#");
10324                    result.append(Integer.toString(ch));
10334                    result.append(';');
1034                 } else {
103589189                    result.append(ch);
1036                 }
1037             }
1038  
103989273            ch = iterator.next();
1040         }
104111331        return result.toString();
1042     }
1043  
1044     /**
1045      * This is a simple file filter that matches all files with the given prefix and suffix.
1046      */
1047     private static class PrefixSuffixFilter implements FileFilter {
1048         /**
1049          * This is the prefix to be matched by the filter.
1050          */
1051         private final String prefix;
1052         /**
1053          * This is the suffix to be matched by the filter.
1054          */
1055         private final String suffix;
1056  
1057         /**
1058          * Creates new PrefixSuffixFilter that matches the given prefix and suffix.
1059          *
1060          * @param prefix the prefix to be matched
1061          * @param suffix the suffix to be matched
1062          *
1063          * @throws IllegalArgumentException if any arg is <tt>null</tt>
1064          */
1065         private PrefixSuffixFilter(final String prefix, final String suffix) {
1066             if (prefix == null) {
1067                 throw new IllegalArgumentException("The parameter named [prefix] was null.");
1068             }
1069             if (suffix == null) {
1070                 throw new IllegalArgumentException("The parameter named [suffix] was null.");
1071             }
1072  
1073             this.prefix = prefix;
1074             this.suffix = suffix;
1075         }
1076  
1077         /**
1078          * Tests whether or not the specified abstract path name should be included in a path name list.
1079          *
1080          * @param pathname The abstract path name to be tested
1081          *
1082          * @return <code>true</code> if and only if <code>pathname</code> should be included
1083          */
1084         public boolean accept(final File pathname) {
1085             final String name = pathname.getName();
1086             return name.startsWith(prefix) && name.endsWith(suffix);
1087         }
1088     }
1089  
1090     /**
1091      * This DefaultHandler instance is used to parse document index XML files.
1092      */
1093     private static class DocumentIndexHandler extends WordSourceIdHandler {
1094         /**
1095          * This is a state flag which signals whether we are between the start and the end of an <em>word</em>. This
1096          * information is used in {@link #characters(char[], int, int)} to determine whether the characters are child
1097          * element of that element or not.
1098          */
1099         private boolean inWord = false;
1100         /**
1101          * This is a state flag which signals whether we are between the start and the end of an <em>pos</em>. This
1102          * information is used in {@link #characters(char[], int, int)} to determine whether the characters are child
1103          * element of that element or not.
1104          */
1105         private boolean inPos = false;
1106  
1107         /**
1108          * This is the collator instance matching the document index's locale. It is used for converting words to {@link
1109          * CollationKey}s. as this instance is constructed after the <em>word-source-data</em> element has ended, the
1110          * case that the field is not <tt>null</tt> any more does furthermore signal that a <em>word-source-data</em>
1111          * element has been parsed fully by the super class and thus no more event must be delegated to the super
1112          * class.
1113          */
1114         private Collator collator = null;
1115  
1116         /**
1117          * This field contains the parsed words of the index as a map from {@link java.text.CollationKey} to a {@link
1118          * java.util.List} of {@link Integer}s being the indices.
1119          */
1120  
1121         private Map words = new HashMap();
1122  
1123         /**
1124          * This field contains the currently parsed word, if any.
1125          */
1126         private String currentWord = null;
1127         /**
1128          * This field contains the currently parsed indices, if any.
1129          */
1130         private List currentIndices = null;
1131         /**
1132          * This field contains the currently parsed pos value.
1133          */
1134         private String currentPos = null;
1135  
1136         /**
1137          * This method is called by the parser when a document start element is encountered.
1138          *
1139          * @param uri the uri of the element encountered
1140          * @param localName the local name of the element encountered
1141          * @param qName the qualified name of the element encountered
1142          * @param attributes the attributes of the element encountered
1143          *
1144          * @throws SAXException in case some error occurs
1145          */
1146         public void startElement(final String uri, final String localName, final String qName,
1147                                  final Attributes attributes)
1148             throws SAXException {
1149             //delegate to super class as long as no full word-source-data element has been parsed
1150             if (collator == null) {
1151                 super.startElement(uri, localName, qName, attributes);
1152             }
1153  
1154             if ("pos".equals(qName)) {
1155                 inPos = true;
1156             } else if ("word".equals(qName)) {
1157                 inWord = true;
1158             } else if ("word-index".equals(qName)) {
1159                 if (collator == null) {
1160                     throw new SAXException(
1161                         "There was no [word-source-data] specified prior to the [word-index] element.");
1162                 }
1163                 currentIndices = new ArrayList();
1164             }
1165         }
1166  
1167         /**
1168          * This method is called by the parser when a TEXT element is encountered.
1169          *
1170          * @param ch the characters encountered
1171          * @param start the start index in ch
1172          * @param length the length in ch
1173          *
1174          * @throws SAXException in case some error occurs
1175          */
1176         public void characters(final char[] ch, final int start, final int length) throws SAXException {
1177             //delegate to super class as long as no full word-source-data element has been parsed
1178             if (collator == null) {
1179                 super.characters(ch, start, length);
1180             }
1181  
1182             if (inPos) {
1183                 final String s = new String(ch, start, length);
1184                 currentPos = currentPos == null ? s : currentPos + s;
1185             } else if (inWord) {
1186                 final String s = new String(ch, start, length);
1187                 currentWord = currentWord == null ? s : currentWord + s;
1188             }
1189         }
1190  
1191         /**
1192          * This method is called by the parser when a document end element is encountered.
1193          *
1194          * @param uri the uri of the element encountered
1195          * @param localName the local name of the element encountered
1196          * @param qName the qualified name of the element encountered
1197          *
1198          * @throws SAXException in case some error occurs
1199          */
1200         public void endElement(final String uri, final String localName, final String qName) throws SAXException {
1201             // delegate to super class as long as no full word-source-data element has been parsed
1202             if (collator == null) {
1203                 super.endElement(uri, localName, qName);
1204             }
1205  
1206             if ("pos".equals(qName)) {
1207                 if (currentPos == null) {
1208                     throw new SAXException(
1209                         "The child text element of tag [pos] was expected to contain non-whitespace data.");
1210                 }
1211                 try {
1212                     currentIndices.add(Integer.valueOf(currentPos.trim()));
1213                 } catch (NumberFormatException e) {
1214                     throw new SAXException(
1215                         "The found [pos] tag value [" + currentPos + "] was not parseable into a valid int.", e);
1216                 }
1217                 inPos = false;
1218                 currentPos = null;
1219             } else if ("word".equals(qName)) {
1220                 if (currentWord == null) {
1221                     throw new SAXException(
1222                         "The child text element of tag [word] was expected to contain non-whitespace data.");
1223                 }
1224                 inWord = false;
1225             } else if ("word-index".equals(qName)) {
1226                 // full entry ended, put it to map.
1227                 words.put(collator.getCollationKey(currentWord), currentIndices);
1228                 currentWord = null;
1229             } else if ("word-source-data".equals(qName)) {
1230                 //super class has done its job, now we can create our collator.
1231                 try {
1232                     collator = Collator.getInstance(createWordSourceId().getSourceLocale());
1233                 } catch (IndexPersistenceException e) {
1234                     throw new SAXException("document index header contained unexpected data.", e);
1235                 }
1236             }
1237         }
1238  
1239         /**
1240          * This method creates a DocumentIndex instance from the data encountered during parsing.
1241          *
1242          * @return the instance created
1243          *
1244          * @throws IndexPersistenceException in case not enough data for creating the instance has been encountered
1245          * during parsing or the creation of the instance fails
1246          */
1247         private DocumentIndex createDocumentIndex() throws IndexPersistenceException {
1248             final WordSourceId sourceId = createWordSourceId();
1249             return new DocumentIndex(sourceId, words);
1250         }
1251     }
1252  
1253     /**
1254      * This is a default handler for parsing WordSourceIds.
1255      */
1256     private static class WordSourceIdHandler extends DefaultHandler {
1257         /**
1258          * This is a state flag which signals whether we are between the start and the end of an
1259          * <em>source-identity</em>. This information is used in {@link #characters(char[], int, int)} to determine
1260          * whether the characters are child element of that element or not.
1261          */
1262         private boolean inSourceIdentity = false;
1263         /**
1264          * This is a state flag which signals whether we are between the start and the end of an <em>class-name</em>.
1265          * This information is used in {@link #characters(char[], int, int)} to determine whether the characters are
1266          * child element of that element or not.
1267          */
1268         private boolean inClassName = false;
1269  
1270         /**
1271          * This is a state flag which signals whether we are between the start and the end of an
1272          * <em>word-source-data</em>. This information is used as an optimization that will disable a lot of unnecessary
1273          * comparison in case the parser is outside the parent element this instance is responsible for.
1274          */
1275         private boolean inScope = false;
1276  
1277         /**
1278          * This field contains the parsed value of the document index source identity.
1279          */
1280         private String sourceIdentityValue = null;
1281         /**
1282          * This field contains the parsed value of the document index identity class name.
1283          */
1284         private String classNameValue = null;
1285  
1286         /**
1287          * This field contains the parsed value of the document index identity locale.
1288          */
1289         private Locale locale = null;
1290  
1291         /**
1292          * This field contains the parsed value of the document index identity delimiters.
1293          */
1294         private List delimiters = new ArrayList();
1295  
1296         /**
1297          * This method is called by the parser when a document start element is encountered.
1298          *
1299          * @param uri the uri of the element encountered
1300          * @param localName the local name of the element encountered
1301          * @param qName the qualified name of the element encountered
1302          * @param attributes the attributes of the element encountered
1303          *
1304          * @throws SAXException in case some error occurs
1305          */
1306         public void startElement(final String uri, final String localName, final String qName,
1307                                  final Attributes attributes)
1308             throws SAXException {
1309             if (inScope) {
1310                 if ("source-identity".equals(qName)) {
1311                     inSourceIdentity = true;
1312                 } else if ("class-name".equals(qName)) {
1313                     inClassName = true;
1314                 } else if ("locale".equals(qName) && locale == null) {
1315                     final String language = getAttribute(attributes, "language", "locale");
1316                     final String country = getAttribute(attributes, "country", "locale");
1317                     final String variant = getAttribute(attributes, "variant", "locale");
1318                     locale = new Locale(language, country, variant);
1319                 } else if ("entry".equals(qName)) {
1320                     delimiters.add(getAttribute(attributes, "delimiter", "entry"));
1321                 }
1322             } else if ("word-source-data".equals(qName)) {
1323                 inScope = true;
1324             }
1325         }
1326  
1327         /**
1328          * This method is called by the parser when a TEXT element is encountered.
1329          *
1330          * @param ch the characters encountered
1331          * @param start the start index in ch
1332          * @param length the length in ch
1333          *
1334          * @throws SAXException in case some error occurs
1335          */
1336         public void characters(final char[] ch, final int start, final int length) throws SAXException {
1337             if (inScope) {
1338                 final String s = new String(ch, start, length);
1339                 if (inSourceIdentity) {
1340                     sourceIdentityValue = sourceIdentityValue == null ? s : sourceIdentityValue + s;
1341                 } else if (inClassName) {
1342                     classNameValue = classNameValue == null ? s : classNameValue + s;
1343                 }
1344             }
1345         }
1346  
1347         /**
1348          * This method is called by the parser when a document end element is encountered.
1349          *
1350          * @param uri the uri of the element encountered
1351          * @param localName the local name of the element encountered
1352          * @param qName the qualified name of the element encountered
1353          *
1354          * @throws SAXException in case some error occurs
1355          */
1356         public void endElement(final String uri, final String localName, final String qName) throws SAXException {
1357             if (inScope) {
1358                 if ("source-identity".equals(qName)) {
1359                     if (sourceIdentityValue == null) {
1360                         throw new SAXException(
1361                             "The child text element of tag [source-identity] was expected "
1362                                 + "to contain non-whitespace data.");
1363                     }
1364                     inSourceIdentity = false;
1365                 } else if ("class-name".equals(qName)) {
1366                     if (classNameValue == null) {
1367                         throw new SAXException(
1368                             "The child text element of tag [class-name] was expected to contain non-whitespace data.");
1369                     }
1370                     inClassName = false;
1371                 } else if ("delimiters".equals(qName)) {
1372                     if (delimiters.isEmpty()) {
1373                         throw new SAXException("At least one [entry] element is expected in the [delimiters] element.");
1374                     }
1375                 } else if ("word-source-data".equals(qName)) {
1376                     if (sourceIdentityValue == null) {
1377                         throw new SAXException(
1378                             "There was no [source-identity] element in the [word-source-data] element.");
1379                     }
1380                     if (classNameValue == null) {
1381                         throw new SAXException("There was no [class-name] element in the [word-source-data] element.");
1382                     }
1383                     if (locale == null) {
1384                         throw new SAXException("There was no [locale] element in the [word-source-data] element.");
1385                     }
1386                     inScope = false;
1387                 }
1388  
1389             }
1390         }
1391  
1392         /**
1393          * This method creates a WordSourceId from the data encountered during the parse process.
1394          *
1395          * @return the WordSourceId parsed
1396          *
1397          * @throws IndexPersistenceException in case not enough data for creating the instance has been encountered
1398          * during parsing or the creation of the instance fails
1399          */
1400         public WordSourceId createWordSourceId() throws IndexPersistenceException {
1401             if (sourceIdentityValue == null) {
1402                 throw new IndexPersistenceException("No [source-identity] value encountered during parsing.");
1403             }
1404             if (classNameValue == null) {
1405                 throw new IndexPersistenceException("No [class-name] value encountered during parsing.");
1406             }
1407             if (classNameValue.trim().length() == 0) {
1408                 throw new IndexPersistenceException("Empty [class-name] value encountered during parsing.");
1409             }
1410             if (locale == null) {
1411                 throw new IndexPersistenceException("No [locale] value encountered during parsing.");
1412             }
1413             if (delimiters.isEmpty()) {
1414                 throw new IndexPersistenceException("No [delimiters] value encountered during parsing.");
1415             }
1416  
1417             //decode the sourceIdentity object from the Base64 String
1418             final Base64Decoder base64Decoder = new Base64Decoder(Base64Codec.IGNORE_ALL, null,
1419                 Base64Codec.STANDARD_ALPHABET);
1420             final byte[] bytes = sourceIdentityValue.getBytes();
1421             final byte[] out = new byte[bytes.length];
1422             base64Decoder.setInput(bytes);
1423             int byteCount = 0;
1424             try {
1425                 byteCount = base64Decoder.inflate(out);
1426             } catch (DataFormatException e) {
1427                 throw new IndexPersistenceException(
1428                     "Error while decoding the Base64 data found in the [source-identity] tag.", e);
1429             }
1430             final byte[] serializedData = new byte[byteCount];
1431             System.arraycopy(out, 0, serializedData, 0, byteCount);
1432             final Object sourceIdentity = Utils.deSerializeObject(serializedData);
1433  
1434             // list to array
1435             final String[] delims = (String[]) delimiters.toArray(new String[delimiters.size()]);
1436  
1437             // build the actual WordSourceId
1438             return new WordSourceId(sourceIdentity, classNameValue, delims, locale);
1439         }
1440     }
1441  
1442     /**
1443      * This is a DefaultHandler for parsing CollectionIndex instances.
1444      */
1445     private static class DocumentCollectionHandler extends DefaultHandler {
1446         /**
1447          * This is a state flag which signals whether we are between the start and the end of an <em>document-id</em>.
1448          * This information is used in {@link #characters(char[], int, int)} to determine whether the characters are
1449          * child element of that element or not.
1450          */
1451         private boolean inDocumentId = false;
1452         /**
1453          * This field contains the parsed document ids.
1454          */
1455  
1456         private List documentIds = new ArrayList();
1457  
1458         /**
1459          * This field contains the currently parsed document, if any.
1460          */
1461         private String currentDocumentId = null;
1462  
1463         /**
1464          * This method is called by the parser when a document start element is encountered.
1465          *
1466          * @param uri the uri of the element encountered
1467          * @param localName the local name of the element encountered
1468          * @param qName the qualified name of the element encountered
1469          * @param attributes the attributes of the element encountered
1470          *
1471          * @throws SAXException in case some error occurs
1472          */
1473         public void startElement(final String uri, final String localName, final String qName,
1474                                  final Attributes attributes)
1475             throws SAXException {
1476             if ("document-id".equals(qName)) {
1477                 inDocumentId = true;
1478             }
1479         }
1480  
1481         /**
1482          * This method is called by the parser when a TEXT element is encountered.
1483          *
1484          * @param ch the characters encountered
1485          * @param start the start index in ch
1486          * @param length the length in ch
1487          *
1488          * @throws SAXException in case some error occurs
1489          */
1490         public void characters(final char[] ch, final int start, final int length) throws SAXException {
1491             if (inDocumentId) {
1492                 final String s = new String(ch, start, length);
1493                 currentDocumentId = currentDocumentId == null ? s : currentDocumentId + s;
1494             }
1495         }
1496  
1497         /**
1498          * This method is called by the parser when a document end element is encountered.
1499          *
1500          * @param uri the uri of the element encountered
1501          * @param localName the local name of the element encountered
1502          * @param qName the qualified name of the element encountered
1503          *
1504          * @throws SAXException in case some error occurs
1505          */
1506         public void endElement(final String uri, final String localName, final String qName) throws SAXException {
1507             if ("document-id".equals(qName)) {
1508                 if (currentDocumentId == null) {
1509                     throw new SAXException(
1510                         "The child text element of tag [document-id] was expected to contain non-whitespace data.");
1511                 }
1512                 documentIds.add(currentDocumentId.trim());
1513                 inDocumentId = false;
1514                 currentDocumentId = null;
1515             }
1516         }
1517  
1518         /**
1519          * This method returns all document ids found during parsing the document collection.
1520          *
1521          * @return the document ids found during parsing
1522          */
1523         private String[] getDocumentIds() {
1524             return (String[]) documentIds.toArray(new String[documentIds.size()]);
1525         }
1526     }
1527  
1528     /**
1529      * This is the DefaultHandler used for parsing the document_usage.xml.
1530      */
1531     private static class DocumentUsageHandler extends DefaultHandler {
1532         /**
1533          * This field contains the parsed usages for documents.
1534          */
1535  
1536         private Map usages = new HashMap();
1537  
1538         /**
1539          * This field contains the currently parsed document, if any.
1540          */
1541         private DocumentUsage currentDocumentUsage = null;
1542  
1543         /**
1544          * This method is called by the parser when a document start element is encountered.
1545          *
1546          * @param uri the uri of the element encountered
1547          * @param localName the local name of the element encountered
1548          * @param qName the qualified name of the element encountered
1549          * @param attributes the attributes of the element encountered
1550          *
1551          * @throws SAXException in case some error occurs
1552          */
1553         public void startElement(final String uri, final String localName, final String qName,
1554                                  final Attributes attributes)
1555             throws SAXException {
1556             if ("document-usage".equals(qName)) {
1557                 final String documentId = getAttribute(attributes, "document-id", "document-usage");
1558                 final String countStr = getAttribute(attributes, "usage-count", "document-usage");
1559                 final int count;
1560                 try {
1561                     count = Integer.parseInt(countStr.trim());
1562                 } catch (NumberFormatException e) {
1563                     throw new SAXException("The found [count] tag attribute value ["
1564                         + countStr + "] was not parseable into a valid int.", e);
1565                 }
1566                 currentDocumentUsage = new DocumentUsage(documentId, count);
1567             } else if ("collection".equals(qName)) {
1568                 if (currentDocumentUsage == null) {
1569                     throw new SAXException("Illegal structure encountered, expected a [document-usage] "
1570                         + "tag to exist as parent of a [collection] tag.");
1571                 }
1572                 currentDocumentUsage.add(getAttribute(attributes, "collection-id", "collection"));
1573             }
1574         }
1575  
1576         /**
1577          * This method is called by the parser when a document end element is encountered.
1578          *
1579          * @param uri the uri of the element encountered
1580          * @param localName the local name of the element encountered
1581          * @param qName the qualified name of the element encountered
1582          *
1583          * @throws SAXException in case some error occurs
1584          */
1585         public void endElement(final String uri, final String localName, final String qName) throws SAXException {
1586             if ("document-usage".equals(qName)) {
1587                 usages.put(currentDocumentUsage.getDocumentId(), currentDocumentUsage);
1588                 currentDocumentUsage = null;
1589             }
1590         }
1591  
1592         /**
1593          * This method does return the document usages parsed from the file.
1594          *
1595          * @return the document usages parsed from the file as map from String to {@link DocumentUsage}
1596          */
1597         private Map getDocumentUsages() {
1598             return usages;
1599         }
1600     }
1601  
1602     /**
1603      * This class represents the document usage elements in document_usage.xml.
1604      */
1605     private static final class DocumentUsage {
1606         /**
1607          * This is the document id of the usage information.
1608          */
1609         private final String documentId;
1610         /**
1611          * This is the use count of the usage information.
1612          */
1613         private int usageCount;
1614         /**
1615          * This is the collection of document index collections the document is contained in.
1616          */
1617         private Set collectionIds = new HashSet();
1618  
1619         /**
1620          * Creates a new DocumentUsage from the given arguments.
1621          *
1622          * @param documentId the document id for the instance
1623          * @param usageCount the usage count for this instance
1624          *
1625          * @throws IllegalArgumentException in case the document id is <tt>null</tt> or empty String
1626          */
1627         private DocumentUsage(final String documentId, final int usageCount) {
1628             if (documentId == null) {
1629                 throw new IllegalArgumentException("The parameter named [documentId] was null.");
1630             }
1631             if (documentId.trim().length() == 0) {
1632                 throw new IllegalArgumentException("The parameter named [documentId] was an empty String.");
1633             }
1634  
1635             this.documentId = documentId;
1636             this.usageCount = usageCount;
1637         }
1638  
1639         /**
1640          * Creates a new DocumentUsage from the given argument. The use count is initialized to 0.
1641          *
1642          * @param documentId the document id for the instance
1643          *
1644          * @throws IllegalArgumentException in case the document id is <tt>null</tt> or empty String
1645          */
1646         private DocumentUsage(final String documentId) {
1647             this.documentId = documentId;
1648             usageCount = 0;
1649         }
1650  
1651         /**
1652          * This method adds the given collection id to the document usage.
1653          *
1654          * @param collectionId the collection id to be removed
1655          */
1656         private void add(final String collectionId) {
1657             collectionIds.add(collectionId);
1658         }
1659  
1660         /**
1661          * This method removes the given collection id from the document usage.
1662          *
1663          * @param collectionId the collection id to be removed
1664          */
1665         private void remove(final String collectionId) {
1666             collectionIds.remove(collectionId);
1667         }
1668  
1669         /**
1670          * This method returns the collection ids contained in this document usage.
1671          *
1672          * @return the collection ids contained in this document usage
1673          */
1674         private String[] getCollectionIds() {
1675             return (String[]) collectionIds.toArray(new String[collectionIds.size()]);
1676         }
1677  
1678         /**
1679          * This method returns the document id of this document usage.
1680          *
1681          * @return the document id of this document usage
1682          */
1683         private String getDocumentId() {
1684             return documentId;
1685         }
1686  
1687         /**
1688          * This method returns the use count of this document usage.
1689          *
1690          * @return the use count of this document usage
1691          */
1692         private int getUsageCount() {
1693             return usageCount;
1694         }
1695  
1696         /**
1697          * This method increments the usage count of this document usage.
1698          */
1699         private void incUsageCount() {
1700             usageCount++;
1701         }
1702  
1703         /**
1704          * This method decrements the usage count of this document usage.
1705          */
1706         private void decUsageCount() {
1707             usageCount--;
1708         }
1709     }
1710 }

this report was generated by version 1.0.5 of jcoverage.
visit www.jcoverage.com for updates.

copyright © 2003, jcoverage ltd. all rights reserved.
Java is a trademark of Sun Microsystems, Inc. in the United States and other countries.