Line | Hits | Source |
---|---|---|
1 | /* | |
2 | * Copyright (C) 2006 TopCoder Inc., All Rights Reserved. | |
3 | */ | |
4 | package com.topcoder.document.index.persistence.impl.xml; | |
5 | ||
6 | import com.topcoder.document.index.CollectionIndex; | |
7 | import com.topcoder.document.index.DocumentIndex; | |
8 | import com.topcoder.document.index.persistence.IndexPersistence; | |
9 | import com.topcoder.document.index.persistence.IndexPersistenceException; | |
10 | import com.topcoder.document.index.persistence.impl.PersistenceConfigurationException; | |
11 | import com.topcoder.document.index.persistence.impl.Utils; | |
12 | import com.topcoder.document.index.wordsource.WordSourceId; | |
13 | import com.topcoder.util.compression.Base64Codec; | |
14 | import com.topcoder.util.compression.Base64Decoder; | |
15 | import com.topcoder.util.compression.Base64Encoder; | |
16 | import org.xml.sax.Attributes; | |
17 | import org.xml.sax.SAXException; | |
18 | import org.xml.sax.helpers.DefaultHandler; | |
19 | ||
20 | import javax.xml.parsers.ParserConfigurationException; | |
21 | import javax.xml.parsers.SAXParser; | |
22 | import javax.xml.parsers.SAXParserFactory; | |
23 | import java.io.BufferedOutputStream; | |
24 | import java.io.File; | |
25 | import java.io.FileFilter; | |
26 | import java.io.FileOutputStream; | |
27 | import java.io.IOException; | |
28 | import java.io.OutputStream; | |
29 | import java.text.CollationKey; | |
30 | import java.text.Collator; | |
31 | import java.text.MessageFormat; | |
32 | import java.text.StringCharacterIterator; | |
33 | import java.util.ArrayList; | |
34 | import java.util.Collection; | |
35 | import java.util.HashMap; | |
36 | import java.util.HashSet; | |
37 | import java.util.Iterator; | |
38 | import java.util.List; | |
39 | import java.util.Locale; | |
40 | import java.util.Map; | |
41 | import java.util.Set; | |
42 | import java.util.zip.DataFormatException; | |
43 | ||
44 | ||
45 | /** | |
46 | * This is a concrete implementation of the IndexPersistence contract that uses XML files as storage medium. | |
47 | * <p/> | |
48 | * This implementation allows for creation of arbitrarily large document index files by sequencing (splitting) large | |
49 | * files into chunks of file based data. Since it is required that binary data is stored in the XML files for document | |
50 | * index a serialized Base64 encoded data will be both encoded and decoded by this implementation. | |
51 | * <p/> | |
52 | * <b>Thread safety:</b> This class is not designed to be thread-safe. | |
53 | * | |
54 | * @author AleaActaEst, TCSDEVELOPER | |
55 | * @version 1.0 | |
56 | */ | |
57 | 243 | public class XmlIndexPersistence implements IndexPersistence { |
58 | ||
59 | /** | |
60 | * This constant is the file name used to represent a document usage as defined by the persistence contract. There | |
61 | * is only one such file per XML persistence name space. | |
62 | */ | |
63 | private static final String DOC_USAGE_FILE_NAME = "document_usage.xml"; | |
64 | ||
65 | /** | |
66 | * This is the file name prefix for all files that represent a Collection Index XML file storage (i.e. a document | |
67 | * collection file) Each collection XML file name will have the following format: | |
68 | * <tt>doc_collection_collectionId_.xml</tt> (note the underscore that follows the collection id). | |
69 | * <p/> | |
70 | * For example here is a simple document collection XML file for a collection with id of <tt>175658</tt>: | |
71 | * <tt>doc_collection_175658_.xml</tt>. | |
72 | */ | |
73 | private static final String DOC_COLLECTION_FILE_NAME_PREFIX = "doc_collection_"; | |
74 | ||
75 | /** | |
76 | * This is the file name prefix for all files that represent a Document Index XML file storage (i.e. a document | |
77 | * index file) Each document index file name will have the following format:<tt>doc_index_docId_sequenceid.xml</tt>. | |
78 | * <p/> | |
79 | * Here is an example of a document index XML file for a document with id of <tt>16527</tt>: | |
80 | * <tt>doc_index_16527_1.xml</tt>. | |
81 | * <p/> | |
82 | * The <tt>sequenceid</tt> is used to 'link' files with data that is larger than a configured limit of e.g. 2GB | |
83 | * (this can be configured). This means that if there is indexed data that is larger than say 2GB the file is split | |
84 | * into 2 files with the first having the sequence id of 1 (not zero-padded) and the second one the next id. This gives the | |
85 | * flexibility for persistence of very large content. | |
86 | */ | |
87 | private static final String DOC_INDEX_FILE_NAME_PREFIX = "doc_index_"; | |
88 | /** | |
89 | * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link | |
90 | * #persistenceDirPath} from the ConfigManager. | |
91 | */ | |
92 | private static final String XML_PERSISTENCE_PATH_PROPERTY_NAME = "XmlPersistencePath"; | |
93 | /** | |
94 | * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link | |
95 | * #fileSizeThreshold} from the ConfigManager. | |
96 | */ | |
97 | private static final String FILE_SIZE_THRESHOLD_PROPERTY_NAME = "FileSizeThreshold"; | |
98 | /** | |
99 | * This is the {@link com.topcoder.util.config.ConfigManager} lookup key used for reading the {@link #fileSizeLimit} | |
100 | * from the ConfigManager. | |
101 | */ | |
102 | private static final String FILE_SIZE_LIMIT_PROPERTY_NAME = "FileSizeLimit"; | |
103 | /** | |
104 | * This constant defines the charset used in XML encoding. | |
105 | */ | |
106 | private static final String UTF_8_CHARSET = "UTF-8"; | |
107 | /** | |
108 | * This constant defines the factor used in converting kilobyte values into byte-values and megabyte values into | |
109 | * kilobytes. | |
110 | */ | |
111 | private static final int KILO_FACTOR = 1024; | |
112 | ||
113 | /** | |
114 | * This is a file size limit for saving a chunk of indexed data. It is supplied in Mbytes but held here in bytes. Consider data so large | |
115 | * that it cannot fit into a single file due to size limitations for the specific OS or file system. This size limit | |
116 | * is used to decide when to start chaining multiple files to store the data. It is initialized from configuration | |
117 | * through the constructor and once initialized cannot be changed. Must be a positive integer (i.e. | |
118 | * <tt>>0</tt>). | |
119 | */ | |
120 | private final int fileSizeLimit; | |
121 | ||
122 | /** | |
123 | * This is a safety net size (supplied in Kbytes, held here in bytes) that is applied before the fileSizeLimit is reached. This is basically | |
124 | * something of a round off error precaution. It is initialized from configuration through the constructor and once | |
125 | * initialized cannot be changed. Must be a non-negative integer (i.e. <tt>>= 0</tt>). | |
126 | */ | |
127 | private final int fileSizeThreshold; | |
128 | ||
129 | /** | |
130 | * This is the file path to a directory where all the XML files that make up this persistence session are stored (it | |
131 | * is almost like a data base namespace). This must be a valid fully qualified path. It is initialized from | |
132 | * configuration through the constructor and once initialized cannot be changed. Cannot be <tt>null</tt> or an empty | |
133 | * string. | |
134 | */ | |
135 | private final String persistenceDirPath; | |
136 | ||
137 | /** | |
138 | * This is a simple constructor which will populate the fileSizeLimit, fileSizeThreshold, and persistenceDirPath | |
139 | * information from configuration by looking up the values from the given {@link | |
140 | * com.topcoder.util.config.ConfigManager} namespace. | |
141 | * | |
142 | * @param namespace the configuration namespace to be used for looking up the values from the {@link | |
143 | * com.topcoder.util.config.ConfigManager} | |
144 | * | |
145 | * @throws PersistenceConfigurationException | |
146 | * if there are configuration issues encountered, e.g. missing or invalid values | |
147 | * @throws IllegalArgumentException if namespace param is <tt>null</tt> or an empty (trim'd) String | |
148 | */ | |
149 | 84 | public XmlIndexPersistence(final String namespace) throws PersistenceConfigurationException { |
150 | 84 | if (namespace == null) { |
151 | 1 | throw new IllegalArgumentException("The parameter named [namespace] was null."); |
152 | } | |
153 | 83 | if (namespace.trim().length() == 0) { |
154 | 1 | throw new IllegalArgumentException("The parameter named [namespace] was an empty String."); |
155 | } | |
156 | 82 | fileSizeLimit = Utils.lookupIntFromConfigManager(namespace, FILE_SIZE_LIMIT_PROPERTY_NAME, 1) * KILO_FACTOR |
157 | * KILO_FACTOR; | |
158 | 77 | fileSizeThreshold = Utils.lookupIntFromConfigManager(namespace, FILE_SIZE_THRESHOLD_PROPERTY_NAME, 0) |
159 | * KILO_FACTOR; | |
160 | try { | |
161 | 73 | persistenceDirPath = checkPath( |
162 | Utils.lookupValidStringFromConfigManager(namespace, XML_PERSISTENCE_PATH_PROPERTY_NAME)); | |
163 | 2 | } catch (IllegalArgumentException e) { |
164 | 2 | throw new PersistenceConfigurationException("The configured persistence directory was not usable.", e); |
165 | 69 | } |
166 | //check whether the directory contains a valid document-usages file | |
167 | try { | |
168 | 69 | readDocumentUsages(); |
169 | 1 | } catch (IndexPersistenceException e) { |
170 | 1 | throw new PersistenceConfigurationException("The persistence directory configured [" + persistenceDirPath |
171 | + "] did not contain a valid document_usages.xml.", e); | |
172 | 68 | } |
173 | 68 | } |
174 | ||
/**
 * Convenience constructor that takes the fileSizeLimit, fileSizeThreshold and persistenceDirPath directly instead
 * of reading them from configuration.
 * <p/>
 * The limit (Mbytes) and the threshold (Kbytes) are converted to bytes before they are stored. The conversion is
 * done with <tt>long</tt> arithmetic and saturates at {@link Integer#MAX_VALUE}, so a limit of 2048 Mbytes or more
 * no longer wraps around into a negative byte count.
 * <p/>
 * The fileSizeLimit and fileSizeThreshold values should be in a meaningful relation to each other, otherwise it
 * could happen that every word is persisted to a separate file (when
 * <tt>fileSizeLimit * 1024 &lt;= fileSizeThreshold</tt>).
 *
 * @param fileSizeLimit      This is a file size limit in Mbytes for saving a chunk of indexed data, must be
 *                           <tt>&gt;0</tt>
 * @param fileSizeThreshold  This is a safety net size in Kbytes that is applied before the fileSizeLimit is
 *                           reached, must be <tt>&gt;=0</tt>
 * @param persistenceDirPath This is the file path to a directory where all the XML files that make up this
 *                           persistence session are stored
 *
 * @throws IllegalArgumentException in case fileSizeLimit is &lt;1 or fileSizeThreshold is &lt;0 or the given
 *                                  persistenceDirPath is rejected by the path check or the directory does not
 *                                  contain a valid document usage file
 */
public XmlIndexPersistence(final int fileSizeLimit, final int fileSizeThreshold, final String persistenceDirPath) {
    if (fileSizeLimit < 1) {
        throw new IllegalArgumentException(
            "The parameter named [fileSizeLimit] was expected to be >0 , but was [" + fileSizeLimit + "].");
    }
    if (fileSizeThreshold < 0) {
        throw new IllegalArgumentException(
            "The parameter named [fileSizeThreshold] was expected to be >=0 , but was [" + fileSizeThreshold
                + "].");
    }

    // Widen to long BEFORE scaling: an int product overflows as soon as the limit reaches 2048 Mbytes.
    this.fileSizeLimit =
        (int) Math.min((long) fileSizeLimit * KILO_FACTOR * KILO_FACTOR, Integer.MAX_VALUE);
    this.fileSizeThreshold =
        (int) Math.min((long) fileSizeThreshold * KILO_FACTOR, Integer.MAX_VALUE);
    this.persistenceDirPath = checkPath(persistenceDirPath);

    // check whether the directory contains a valid document-usages file
    try {
        readDocumentUsages();
    } catch (IndexPersistenceException e) {
        // initCause is used as this code base does not rely on the (message, cause) constructor
        final IllegalArgumentException illegalArgumentException = new IllegalArgumentException(
            "The persistence directory configured [" + persistenceDirPath
                + "] did not contain a valid " + DOC_USAGE_FILE_NAME + ".");
        illegalArgumentException.initCause(e);
        throw illegalArgumentException;
    }
}
216 | ||
217 | /** | |
218 | * This method adds the given DocumentIndex to the persistence. | |
219 | * | |
220 | * @param documentIndex WordSource representing the document to be indexed, should not be <tt>null</tt> | |
221 | * | |
222 | * @throws IllegalArgumentException when documentIndex is <tt>null</tt> | |
223 | * @throws IndexPersistenceException when the method fails to add DocumentIndex to the persistence | |
224 | */ | |
225 | public void addDocumentIndex(final DocumentIndex documentIndex) throws IndexPersistenceException { | |
226 | 25 | if (documentIndex == null) { |
227 | 1 | throw new IllegalArgumentException("The parameter named [documentIndex] was null."); |
228 | } | |
229 | ||
230 | 24 | final String documentID = Utils.createIdString(documentIndex.getWordSourceId()); |
231 | ||
232 | 24 | final File firstFile = new File(persistenceDirPath, DOC_INDEX_FILE_NAME_PREFIX + documentID + "_" |
233 | + 1 + ".xml"); | |
234 | 24 | if (firstFile.exists()) { |
235 | 1 | throw new IndexPersistenceException("The given index does already exist in the persistent storage as file [" |
236 | + firstFile.getName() + "]."); | |
237 | } | |
238 | ||
239 | 23 | final Locale locale = documentIndex.getLocale(); |
240 | 23 | final WordSourceId sourceId = documentIndex.getWordSourceId(); |
241 | ||
242 | // serialize the source identity and encode it into a Base64 String | |
243 | 23 | final byte[] input = Utils.serializeObject(sourceId.getSourceIdentity()); |
244 | 23 | final byte[] bytes = new byte[input.length * 2]; |
245 | 23 | final Base64Encoder base64Encoder = new Base64Encoder(0, null, Base64Codec.STANDARD_ALPHABET); |
246 | 23 | base64Encoder.setInput(input); |
247 | 23 | base64Encoder.finish(); |
248 | 23 | final int size = base64Encoder.deflate(bytes); |
249 | 23 | final String sourceIdentityString = new String(bytes, 0, size); |
250 | ||
251 | //build the delimiters list | |
252 | 23 | final String[] delimiters = sourceId.getDelimiters(); |
253 | 23 | final StringBuffer temp = new StringBuffer(); |
254 | 97 | for (int i = 0; i < delimiters.length; i++) { |
255 | 74 | temp.append("<entry delimiter=\""); |
256 | 74 | temp.append(escapeForXML(delimiters[i])); |
257 | 74 | temp.append("\"/>"); |
258 | } | |
259 | 23 | final String delimiterEntries = temp.toString(); |
260 | ||
261 | // this has not been extracted to be a string constant | |
262 | // as the readability and maintainability would be | |
263 | // decreased when separating the format message with | |
264 | // its placeholders from the actual values used | |
265 | // for the placeholders | |
266 | 23 | final String header = MessageFormat.format( |
267 | "<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?>" | |
268 | + "<doc lang=\"{0}-{1}\">" | |
269 | + "<document-index document-id=\"{2}\">" | |
270 | + "<word-source-data>" | |
271 | + "<source-identity>{3}</source-identity>" | |
272 | + "<class-name>{4}</class-name>" | |
273 | + "<locale language=\"{0}\" country=\"{1}\" variant=\"{5}\"/>" | |
274 | + "<delimiters>{6}</delimiters>" | |
275 | + "</word-source-data>", | |
276 | new Object[]{ | |
277 | escapeForXML(locale.getLanguage()), | |
278 | escapeForXML(locale.getCountry()), | |
279 | escapeForXML(documentID), | |
280 | sourceIdentityString, // unescaped as Base64 is ASCII only | |
281 | escapeForXML(sourceId.getSourceClassName()), | |
282 | escapeForXML(locale.getVariant()), | |
283 | delimiterEntries // unescaped as already escaped during construction | |
284 | } | |
285 | ); | |
286 | ||
287 | 23 | final String footer = "</document-index></doc>"; |
288 | ||
289 | 23 | writeDocumentDataFiles(header, footer, documentID, documentIndex); |
290 | ||
291 | //insert the entry representing the document into document_usages.xml | |
292 | 23 | final Map map = readDocumentUsages(); |
293 | 23 | map.put(documentID, new DocumentUsage(documentID)); |
294 | 23 | writeDocumentUsages(map); |
295 | 23 | } |
296 | ||
297 | /** | |
298 | * This method retrieves a DocumentIndex with the specified WordSourceId from the persistence. It does return | |
299 | * <tt>null</tt>, if DocumentIndex with the given WordSourceId is not found in the persistence. | |
300 | * <p/> | |
301 | * CS section 1.4.4.1 describes the algorithm of this method. | |
302 | * | |
303 | * @param wordSourceId WordSourceId of document to retrieve | |
304 | * | |
305 | * @return the retrieved DocumentIndex, or <tt>null</tt> if document index with given WordSourceId is not found in | |
306 | * the persistence | |
307 | * | |
308 | * @throws IllegalArgumentException when wordSourceId is <tt>null</tt> | |
309 | * @throws IndexPersistenceException when the implementation fails to retrieve document index with the specified | |
310 | * WordSourceId | |
311 | */ | |
312 | public DocumentIndex getDocumentIndex(final WordSourceId wordSourceId) throws IndexPersistenceException { | |
313 | 11 | if (wordSourceId == null) { |
314 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
315 | } | |
316 | ||
317 | 10 | final String documentId = Utils.createIdString(wordSourceId); |
318 | ||
319 | 10 | return getDocumentIndex(documentId); |
320 | } | |
321 | ||
322 | /** | |
323 | * This method removes a document index with given WordSourceId from the persistence. When the document index is not | |
324 | * found, or function fail to remove it, an exception is thrown. Function does also not succeed when document index | |
325 | * use count is not zero. | |
326 | * <p/> | |
327 | * CS section 1.4.4.2 describes the algorithm of this method. | |
328 | * | |
329 | * @param wordSourceId WordSourceId of document index to remove | |
330 | * | |
331 | * @throws IllegalArgumentException when WordSourceId is <tt>null</tt> | |
332 | * @throws IndexPersistenceException when document index is not found in the persistence, its use count is not zero, | |
333 | * or error happens when trying to remove it | |
334 | */ | |
335 | public void removeDocumentIndex(final WordSourceId wordSourceId) throws IndexPersistenceException { | |
336 | 5 | if (wordSourceId == null) { |
337 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
338 | } | |
339 | ||
340 | 4 | final int count = getDocumentUseCount(wordSourceId); |
341 | 3 | if (count != 0) { |
342 | 1 | throw new IndexPersistenceException("The index with the given id [" + wordSourceId |
343 | + "] cannot be deleted as its use count is not 0, but is " + count + "."); | |
344 | } | |
345 | ||
346 | 2 | final String documentId = Utils.createIdString(wordSourceId); |
347 | 2 | final File[] indexFiles = new File(persistenceDirPath).listFiles( |
348 | new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX + documentId, ".xml")); | |
349 | 4 | for (int i = 0; i < indexFiles.length; i++) { |
350 | 2 | final File indexFile = indexFiles[i]; |
351 | 2 | indexFile.delete(); |
352 | } | |
353 | ||
354 | // update document_usages.xml | |
355 | // (remove element representing document) | |
356 | 2 | final Map map = readDocumentUsages(); |
357 | 2 | map.remove(documentId); |
358 | 2 | writeDocumentUsages(map); |
359 | 2 | } |
360 | ||
361 | /** | |
362 | * This method stores the given document collection index in the persistence. | |
363 | * <p/> | |
364 | * CS section 1.4.4.4 describes the algorithm of this method. | |
365 | * | |
366 | * @param collectionIndex CollectionIndex to store | |
367 | * | |
368 | * @throws IllegalArgumentException if collectionIndex is <tt>null</tt> | |
369 | * @throws IndexPersistenceException if fails to create and store document collection index or the given index | |
370 | * already exists in persistence or contains unpersisted documents | |
371 | */ | |
372 | public void addCollectionIndex(final CollectionIndex collectionIndex) | |
373 | throws IndexPersistenceException { | |
374 | 11 | if (collectionIndex == null) { |
375 | 1 | throw new IllegalArgumentException("The parameter named [collectionIndex] was null."); |
376 | } | |
377 | ||
378 | 10 | final String collectionId = collectionIndex.getId(); |
379 | 10 | final File persistenceFile = |
380 | new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml"); | |
381 | ||
382 | 10 | if (persistenceFile.exists()) { |
383 | 1 | throw new IndexPersistenceException("The given Collection index does already exist in persistence as file [" |
384 | + persistenceFile.getName() + "]."); | |
385 | } | |
386 | ||
387 | //build the documents list | |
388 | 9 | final Set allDocumentIds = collectionIndex.getAllDocumentIds(); |
389 | // this is a set of the string form of the document ids, | |
390 | // re-used when updating the usage information | |
391 | 9 | final Set documentIdStrings = new HashSet(); |
392 | 9 | final StringBuffer temp = new StringBuffer(); |
393 | 9 | for (Iterator iterator = allDocumentIds.iterator(); iterator.hasNext();) { |
394 | 11 | final WordSourceId sourceId = (WordSourceId) iterator.next(); |
395 | 11 | final String documentId = Utils.createIdString(sourceId); |
396 | ||
397 | //check whether document has been persisted | |
398 | 11 | final String expectedFileName = |
399 | DOC_INDEX_FILE_NAME_PREFIX + documentId + "_" + 1 + ".xml"; | |
400 | 11 | if (!new File(persistenceDirPath, expectedFileName).exists()) { |
401 | 1 | throw new IndexPersistenceException( |
402 | "Cannot add collection when its contained documents are not yet persisted (document with id [" | |
403 | + sourceId + "] was not persisted)."); | |
404 | } | |
405 | ||
406 | 10 | documentIdStrings.add(documentId); |
407 | 10 | temp.append("<document-id>"); |
408 | 10 | temp.append(escapeForXML(documentId)); |
409 | 10 | temp.append("</document-id>"); |
410 | } | |
411 | 8 | final String documentList = temp.toString(); |
412 | ||
413 | // this has not been extracted to be a string constant | |
414 | // as the readability and maintainability would be | |
415 | // decreased when separating the format message with | |
416 | // its placeholders from the actual values used | |
417 | // for the placeholders | |
418 | 8 | final String fileContent = MessageFormat.format( |
419 | "<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?>" | |
420 | + "<document-collection collection-id=\"{0}\">" | |
421 | + "<documents>{1}</documents>" | |
422 | + "</document-collection>", | |
423 | new Object[]{ | |
424 | escapeForXML(collectionId), | |
425 | documentList // already escaped | |
426 | } | |
427 | ); | |
428 | ||
429 | //write out the data to the file | |
430 | 8 | writeUTF8File(fileContent, persistenceFile); |
431 | ||
432 | 8 | if (!documentIdStrings.isEmpty()) { |
433 | //update document usages | |
434 | 8 | final Map documentUsages = readDocumentUsages(); |
435 | 8 | for (Iterator iterator = documentIdStrings.iterator(); iterator.hasNext();) { |
436 | 10 | final String documentId = (String) iterator.next(); |
437 | 10 | DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId); |
438 | 10 | if (usage == null) { |
439 | 0 | usage = new DocumentUsage(documentId); |
440 | 0 | documentUsages.put(documentId, usage); |
441 | } | |
442 | 10 | usage.add(collectionId); |
443 | } | |
444 | 8 | writeDocumentUsages(documentUsages); |
445 | } | |
446 | 8 | } |
447 | ||
448 | /** | |
449 | * This method retrieves a document collection index with specified identifier. Id does return <tt>null</tt> when | |
450 | * collection with specified identifier is not found | |
451 | * <p/> | |
452 | * CS section 1.4.4.5 describes the algorithm of this method. | |
453 | * | |
454 | * @param collectionId identifier of document collection index to retrieve | |
455 | * | |
456 | * @return CollectionIndex with specified identifier or <tt>null</tt> if the collection index does not exist in | |
457 | * persistence | |
458 | * | |
459 | * @throws IllegalArgumentException when collectionId is <tt>null</tt> or empty string | |
460 | * @throws IndexPersistenceException when collection index can not be retrieved | |
461 | */ | |
462 | public CollectionIndex getCollectionIndex(final String collectionId) throws IndexPersistenceException { | |
463 | 10 | if (collectionId == null) { |
464 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
465 | } | |
466 | 9 | if (collectionId.trim().length() == 0) { |
467 | 1 | throw new IllegalArgumentException("The parameter named [collectionId] was an empty String."); |
468 | } | |
469 | ||
470 | 8 | final File persistenceFile = |
471 | new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml"); | |
472 | ||
473 | 8 | if (!persistenceFile.exists()) { |
474 | 2 | return null; |
475 | } | |
476 | ||
477 | 6 | final DocumentCollectionHandler dh = new DocumentCollectionHandler(); |
478 | 6 | saxParse(persistenceFile, dh); |
479 | 6 | final String[] documentIds = dh.getDocumentIds(); |
480 | ||
481 | 6 | final Map wordsOfCollection = new HashMap(); |
482 | 6 | final Set documentSourceIds = new HashSet(); |
483 | ||
484 | 14 | for (int i = 0; i < documentIds.length; i++) { |
485 | 8 | final String documentId = documentIds[i]; |
486 | 8 | final DocumentIndex index = getDocumentIndex(documentId); |
487 | 8 | final WordSourceId sourceId = index.getWordSourceId(); |
488 | ||
489 | // add all words of the index to the collection's word-index | |
490 | // map and add a reference to the current index | |
491 | 8 | final Set wordsOfIndex = index.getWords().keySet(); |
492 | 8 | for (Iterator iterator = wordsOfIndex.iterator(); iterator.hasNext();) { |
493 | 24 | final CollationKey collationKey = (CollationKey) iterator.next(); |
494 | 24 | Set documentsCotainingWord = (Set) wordsOfCollection.get(collationKey); |
495 | 24 | if (documentsCotainingWord == null) { |
496 | 18 | documentsCotainingWord = new HashSet(); |
497 | 18 | wordsOfCollection.put(collationKey, documentsCotainingWord); |
498 | } | |
499 | 24 | documentsCotainingWord.add(sourceId); |
500 | } | |
501 | // add document to collection's contained documents set | |
502 | 8 | documentSourceIds.add(sourceId); |
503 | } | |
504 | // build the collection object and return it | |
505 | 6 | return new CollectionIndex(null, wordsOfCollection, documentSourceIds, collectionId); |
506 | } | |
507 | ||
508 | /** | |
509 | * This method does remove document collection index with specified identifier from the persistence. | |
510 | * <p/> | |
511 | * CS section 1.4.4.7 describes the algorithm of this method. | |
512 | * | |
513 | * @param collectionId identifier of document collection index to remove | |
514 | * | |
515 | * @throws IllegalArgumentException when collectionId is <tt>null</tt> or empty string | |
516 | * @throws IndexPersistenceException when fails to remove CollectionIndex with given identifier or the collection | |
517 | * does not exist in persistence | |
518 | */ | |
519 | public void removeCollectionIndex(final String collectionId) throws IndexPersistenceException { | |
520 | 7 | if (collectionId == null) { |
521 | 1 | throw new IllegalArgumentException("The parameter named [collectionId] was null."); |
522 | } | |
523 | 6 | if (collectionId.trim().length() == 0) { |
524 | 1 | throw new IllegalArgumentException("The parameter named [collectionId] was an empty String."); |
525 | } | |
526 | 5 | final File persistenceFile = |
527 | new File(persistenceDirPath, DOC_COLLECTION_FILE_NAME_PREFIX + collectionId + "_" + ".xml"); | |
528 | ||
529 | 5 | if (!persistenceFile.exists()) { |
530 | 2 | throw new IndexPersistenceException( |
531 | "The Collection index with the given id [" + collectionId + "] does not exist in persistence as file [" | |
532 | + persistenceFile.getName() + "]."); | |
533 | } | |
534 | //delete the index file | |
535 | 3 | persistenceFile.delete(); |
536 | ||
537 | //update document usages | |
538 | 3 | final Map documentUsages = readDocumentUsages(); |
539 | ||
540 | 3 | final Collection usages = documentUsages.values(); |
541 | 3 | for (Iterator iterator = usages.iterator(); iterator.hasNext();) { |
542 | 4 | final DocumentUsage documentUsage = (DocumentUsage) iterator.next(); |
543 | 4 | documentUsage.remove(collectionId); |
544 | } | |
545 | ||
546 | 3 | writeDocumentUsages(documentUsages); |
547 | 3 | } |
548 | ||
549 | /** | |
550 | * This method does update specified CollectionIndex in the persistence. | |
551 | * <p/> | |
552 | * CS section 1.4.4.6 describes the algorithm of this method. | |
553 | * | |
554 | * @param collectionIndex CollectionIndex to update | |
555 | * | |
556 | * @throws IllegalArgumentException if collectionIndex parameter is <tt>null</tt> | |
557 | * @throws IndexPersistenceException if any error happens when updating the collection index in the persistence, | |
558 | * this includes the case, when the specified collectionIndex is not found in the | |
559 | * persistence | |
560 | */ | |
561 | public void updateCollectionIndex(final CollectionIndex collectionIndex) | |
562 | throws IndexPersistenceException { | |
563 | 3 | if (collectionIndex == null) { |
564 | 1 | throw new IllegalArgumentException("The parameter named [collectionIndex] was null."); |
565 | } | |
566 | // There is no real overhead in implementing the method this | |
567 | // way and it avoids a lot of code duplication too | |
568 | 2 | removeCollectionIndex(collectionIndex.getId()); |
569 | 1 | addCollectionIndex(collectionIndex); |
570 | 1 | } |
571 | ||
572 | /** | |
573 | * This method increases the use count value for document index with specified WordSourceId. | |
574 | * | |
575 | * @param wordSourceId WordSourceId of document index which to update use count | |
576 | * | |
577 | * @throws IllegalArgumentException if wordSourceId is <tt>null</tt> | |
578 | * @throws IndexPersistenceException when fails to increase document index use count in the persistence or document | |
579 | * index with id specified does not exist in persistence | |
580 | */ | |
581 | public void increaseDocumentUseCount(final WordSourceId wordSourceId) | |
582 | throws IndexPersistenceException { | |
583 | 8 | if (wordSourceId == null) { |
584 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
585 | } | |
586 | ||
587 | 7 | final String documentId = Utils.createIdString(wordSourceId); |
588 | ||
589 | 7 | final Map documentUsages = readDocumentUsages(); |
590 | 7 | final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId); |
591 | 7 | if (usage == null) { |
592 | 1 | throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId |
593 | + "], string value is [" + documentId + "] does not exist in the persistence."); | |
594 | } | |
595 | 6 | usage.incUsageCount(); |
596 | ||
597 | 6 | writeDocumentUsages(documentUsages); |
598 | 6 | } |
599 | ||
600 | /** | |
601 | * This method decreases the use count value for document index with specified WordSourceId. | |
602 | * | |
603 | * @param wordSourceId WordSourceId of document index of which to update use count | |
604 | * | |
605 | * @throws IllegalArgumentException if wordSourceId is <tt>null</tt> | |
606 | * @throws IndexPersistenceException when fails to decrease document index use count in the persistence or document | |
607 | * index with id specified does not exist in persistence | |
608 | */ | |
609 | public void decreaseDocumentUseCount(final WordSourceId wordSourceId) | |
610 | throws IndexPersistenceException { | |
611 | 7 | if (wordSourceId == null) { |
612 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
613 | } | |
614 | ||
615 | 6 | final String documentId = Utils.createIdString(wordSourceId); |
616 | ||
617 | 6 | final Map documentUsages = readDocumentUsages(); |
618 | 6 | final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId); |
619 | 6 | if (usage == null) { |
620 | 1 | throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId |
621 | + "], string value is [" + documentId + "] does not exist in the persistence."); | |
622 | } | |
623 | 5 | usage.decUsageCount(); |
624 | ||
625 | 5 | writeDocumentUsages(documentUsages); |
626 | 5 | } |
627 | ||
628 | /** | |
629 | * This method does return set of WordSourceId of documents that have been persisted in this persistence instance. | |
630 | * <p/> | |
631 | * CS section 1.4.4.1 describes the algorithm of this method. | |
632 | * | |
633 | * @return set of WordSourceId of documents that have been persisted | |
634 | * | |
635 | * @throws IndexPersistenceException when fails to retrieve the ids | |
636 | */ | |
637 | public Set getIndexedDocuments() throws IndexPersistenceException { | |
638 | 4 | final Set ret = new HashSet(); |
639 | 4 | final File[] indexFiles = new File(persistenceDirPath).listFiles( |
640 | new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX, "_1.xml")); | |
641 | ||
642 | 4 | if (indexFiles == null) { |
643 | 0 | throw new IndexPersistenceException("An error has occurred while scanning the persistence directory [" |
644 | + persistenceDirPath + "] for matching persistence files."); | |
645 | } | |
646 | ||
647 | 8 | for (int i = 0; i < indexFiles.length; i++) { |
648 | 4 | final WordSourceIdHandler dh = new WordSourceIdHandler(); |
649 | 4 | saxParse(indexFiles[i], dh); |
650 | 4 | ret.add(dh.createWordSourceId()); |
651 | } | |
652 | 4 | return ret; |
653 | } | |
654 | ||
655 | /** | |
656 | * This method does return the count of how many different groups this document is a member of. | |
657 | * | |
658 | * @param wordSourceId WordSourceId of document index of which to retrieve use count | |
659 | * | |
660 | * @return the use count of the document with the given id | |
661 | * | |
662 | * @throws IllegalArgumentException if wordSourceId is <tt>null</tt> | |
663 | * @throws IndexPersistenceException when fails to retrieve document index use count in the persistence or document | |
664 | * index with id specified does not exist in persistence | |
665 | */ | |
666 | public int getDocumentUseCount(final WordSourceId wordSourceId) | |
667 | throws IndexPersistenceException { | |
668 | 20 | if (wordSourceId == null) { |
669 | 1 | throw new IllegalArgumentException("The parameter named [wordSourceId] was null."); |
670 | } | |
671 | ||
672 | 19 | final String documentId = Utils.createIdString(wordSourceId); |
673 | ||
674 | 19 | final Map documentUsages = readDocumentUsages(); |
675 | 19 | final DocumentUsage usage = (DocumentUsage) documentUsages.get(documentId); |
676 | 19 | if (usage == null) { |
677 | 2 | throw new IndexPersistenceException("The index document with the given word source id [" + wordSourceId |
678 | + "], string value is [" + documentId + "] does not exist in the persistence."); | |
679 | } | |
680 | 17 | return usage.getUsageCount(); |
681 | } | |
682 | ||
683 | /** | |
684 | * This method checks whether the given non-<tt>null</tt> non-empty String denotes a Path to a existent and writable | |
685 | * directory. | |
686 | * | |
687 | * @param path the path to be checked | |
688 | * | |
689 | * @return the checked path | |
690 | * | |
691 | * @throws IllegalArgumentException in case the given path violates any of the rules stated above | |
692 | */ | |
693 | private String checkPath(final String path) { | |
694 | 78 | if (path == null) { |
695 | 1 | throw new IllegalArgumentException("The parameter named [path] was null."); |
696 | } | |
697 | 77 | if (path.trim().length() == 0) { |
698 | 1 | throw new IllegalArgumentException("The parameter named [path] was an empty String."); |
699 | } | |
700 | ||
701 | 76 | final File file = new File(path); |
702 | 76 | if (!file.exists()) { |
703 | 2 | throw new IllegalArgumentException("The given path [" + path + "] does not exist."); |
704 | } | |
705 | 74 | if (!file.isDirectory()) { |
706 | 2 | throw new IllegalArgumentException("The given path [" + path + "] was not a directory."); |
707 | } | |
708 | 72 | if (!file.canRead()) { |
709 | 0 | throw new IllegalArgumentException("The given path [" + path + "] is not readable."); |
710 | } | |
711 | 72 | if (!file.canWrite()) { |
712 | 0 | throw new IllegalArgumentException("The given path [" + path + "] is not writable."); |
713 | } | |
714 | 72 | return path; |
715 | } | |
716 | ||
717 | /** | |
718 | * This method retrieves a DocumentIndex with the specified documentId (calculated from the wordSourceId) from the | |
719 | * persistence. It does return <tt>null</tt>, if DocumentIndex with the given WordSourceId is not found in the | |
720 | * persistence. | |
721 | * <p/> | |
722 | * CS section 1.4.4.1 describes the algorithm of this method. | |
723 | * | |
724 | * @param documentId id of document to retrieve | |
725 | * | |
726 | * @return the retrieved DocumentIndex, or <tt>null</tt> if document index with given WordSourceId is not found in | |
727 | * the persistence | |
728 | * | |
729 | * @throws IllegalArgumentException when wordSourceId is <tt>null</tt> | |
730 | * @throws IndexPersistenceException when the implementation fails to retrieve document index with the specified | |
731 | * WordSourceId | |
732 | */ | |
733 | private DocumentIndex getDocumentIndex(final String documentId) throws IndexPersistenceException { | |
734 | 18 | final File[] indexFiles = new File(persistenceDirPath).listFiles( |
735 | new PrefixSuffixFilter(DOC_INDEX_FILE_NAME_PREFIX + documentId, ".xml")); | |
736 | ||
737 | 18 | if (indexFiles == null) { |
738 | 0 | throw new IndexPersistenceException("An error has occurred while scanning the persistence directory [" |
739 | + persistenceDirPath + "] for matching persistence files."); | |
740 | } | |
741 | 18 | if (indexFiles.length == 0) { |
742 | 2 | return null; |
743 | } | |
744 | ||
745 | 16 | final DocumentIndexHandler dh = new DocumentIndexHandler(); |
746 | 35 | for (int i = 0; i < indexFiles.length; i++) { |
747 | 19 | final String expectedFileName = |
748 | DOC_INDEX_FILE_NAME_PREFIX + documentId + "_" + (i + 1) + ".xml"; | |
749 | 19 | final File currentFile = new File(persistenceDirPath, expectedFileName); |
750 | 19 | if (!currentFile.exists()) { |
751 | 0 | throw new IndexPersistenceException( |
752 | "Corrupt Persistence detected: Expected to encounter file with name [" + expectedFileName | |
753 | + "], but next file found has name [" + currentFile.getName() + "]."); | |
754 | ||
755 | } | |
756 | 19 | saxParse(currentFile, dh); |
757 | } | |
758 | ||
759 | 16 | return dh.createDocumentIndex(); |
760 | } | |
761 | ||
762 | /** | |
763 | * This method writes the actual document index data files. | |
764 | * | |
765 | * @param header the header XML-fragment string to be used when starting a new file | |
766 | * @param footer the footer XML-fragment string to be used when ending a file | |
767 | * @param documentID the document id calculated from the document index | |
768 | * @param documentIndex the index to be written to the files | |
769 | * | |
770 | * @throws IndexPersistenceException in case the operation fails | |
771 | */ | |
772 | private void writeDocumentDataFiles(final String header, final String footer, final String documentID, | |
773 | final DocumentIndex documentIndex) | |
774 | throws | |
775 | IndexPersistenceException { | |
776 | 23 | OutputStream outputStream = null; |
777 | try { | |
778 | 23 | final byte[] headerData = header.getBytes(UTF_8_CHARSET); |
779 | 23 | final byte[] footerData = footer.getBytes(UTF_8_CHARSET); |
780 | ||
781 | //start the initial file here | |
782 | 23 | int fileSequence = 1; |
783 | 23 | outputStream = new BufferedOutputStream( |
784 | new FileOutputStream(new File(persistenceDirPath, DOC_INDEX_FILE_NAME_PREFIX + documentID + "_" | |
785 | + (fileSequence++) + ".xml"))); | |
786 | 23 | outputStream.write(headerData); |
787 | 23 | int bytesWritten = headerData.length; |
788 | ||
789 | // iterate the words and write them out | |
790 | 23 | final Set set = documentIndex.getWords().entrySet(); |
791 | 23 | for (Iterator iterator = set.iterator(); iterator.hasNext();) { |
792 | 11060 | final Map.Entry entry = (Map.Entry) iterator.next(); |
793 | 11060 | final List indices = (List) entry.getValue(); |
794 | 11060 | final StringBuffer wordData = new StringBuffer(); |
795 | 11060 | wordData.append("<word-index><word>"); |
796 | 11060 | wordData.append(escapeForXML(((CollationKey) entry.getKey()).getSourceString())); |
797 | 11060 | wordData.append("</word>"); |
798 | 11060 | for (Iterator iterator1 = indices.iterator(); iterator1.hasNext();) { |
799 | 22180 | wordData.append("<pos>"); |
800 | 22180 | wordData.append((Integer) iterator1.next()); |
801 | 22180 | wordData.append("</pos>"); |
802 | } | |
803 | 11060 | wordData.append("</word-index>"); |
804 | 11060 | final byte[] wordEntry = wordData.toString().getBytes(UTF_8_CHARSET); |
805 | ||
806 | // we have to start a new file in case | |
807 | // the data to be written would exceed the file size threshold | |
808 | 11060 | if (bytesWritten + wordEntry.length + footerData.length >= fileSizeLimit - fileSizeThreshold) { |
809 | //close previous file | |
810 | 3 | outputStream.write(footerData); |
811 | 3 | outputStream.close(); |
812 | ||
813 | //create new file | |
814 | 3 | outputStream = new BufferedOutputStream(new FileOutputStream(new File(persistenceDirPath, |
815 | DOC_INDEX_FILE_NAME_PREFIX + documentID + "_" | |
816 | + (fileSequence++) + ".xml"))); | |
817 | ||
818 | //write header | |
819 | 3 | outputStream.write(headerData); |
820 | 3 | bytesWritten = headerData.length; |
821 | } | |
822 | ||
823 | //now write the word entry data to current file | |
824 | 11060 | outputStream.write(wordEntry); |
825 | 11060 | bytesWritten += wordEntry.length; |
826 | } | |
827 | 23 | outputStream.write(footerData); |
828 | 23 | outputStream.flush(); |
829 | 0 | } catch (IOException e) { |
830 | 0 | throw new IndexPersistenceException("Error while persisting index [" + documentIndex + "] to file.", e); |
831 | } finally { | |
832 | 23 | safeClose(outputStream); |
833 | 23 | } |
834 | 23 | } |
835 | ||
836 | /** | |
837 | * This method closes the given output stream, ignoring all exceptions that may occur during this operation . | |
838 | * | |
839 | * @param outputStream the stream to be closed, may be <tt>null</tt> | |
840 | */ | |
841 | private static void safeClose(final OutputStream outputStream) { | |
842 | 78 | if (outputStream != null) { |
843 | try { | |
844 | 78 | outputStream.close(); |
845 | 0 | } catch (IOException e) { |
846 | // There is nothing we can do here, as we may have come here due to an exception, we | |
847 | // cannot re-throw this one as it would mask the original exception. | |
848 | 78 | } |
849 | } | |
850 | 78 | } |
851 | ||
852 | /** | |
853 | * This method creates a new SAXParser and lets it parse the given file using the given DefaultHandler. | |
854 | * | |
855 | * @param file the file to be parsed | |
856 | * @param defaultHandler the handler to be used when parsing | |
857 | * | |
858 | * @throws IndexPersistenceException in case the parse operation fails | |
859 | * @throws IllegalArgumentException incase any arg is <tt>null</tt> | |
860 | */ | |
861 | private static void saxParse(final File file, final DefaultHandler defaultHandler) | |
862 | throws IndexPersistenceException { | |
863 | 167 | if (file == null) { |
864 | 0 | throw new IllegalArgumentException("The parameter named [file] was null."); |
865 | } | |
866 | 167 | if (defaultHandler == null) { |
867 | 0 | throw new IllegalArgumentException("The parameter named [defaultHandler] was null."); |
868 | } | |
869 | ||
870 | final SAXParser saxParser; | |
871 | try { | |
872 | 167 | saxParser = SAXParserFactory.newInstance().newSAXParser(); |
873 | 0 | } catch (ParserConfigurationException e) { |
874 | 0 | throw new IndexPersistenceException("Error while creating SAX parser.", e); |
875 | 0 | } catch (SAXException e) { |
876 | 0 | throw new IndexPersistenceException("Error while creating SAX parser.", e); |
877 | 167 | } |
878 | try { | |
879 | 167 | saxParser.parse(file, defaultHandler); |
880 | 0 | } catch (SAXException e) { |
881 | 0 | throw new IndexPersistenceException("Error while parsing input file [" + file.getName() + "] .", |
882 | e); | |
883 | ||
884 | 0 | } catch (IOException e) { |
885 | 0 | throw new IndexPersistenceException("Error while parsing input file [" + file.getName() + "] .", |
886 | e); | |
887 | 167 | } |
888 | 167 | } |
889 | ||
890 | /** | |
891 | * This utility method retrieves an attribute value from the given attributes list, throwing a SAXException in case | |
892 | * the attribute does not exist. | |
893 | * | |
894 | * @param attributes the attributes list to retrieve the value from | |
895 | * @param attrName the attribute name | |
896 | * @param tagName the current tag name, used in error messages | |
897 | * | |
898 | * @return the attribute value | |
899 | * | |
900 | * @throws SAXException in case the attribute does not exist | |
901 | */ | |
902 | private static String getAttribute(final Attributes attributes, final String attrName, final String tagName) | |
903 | throws SAXException { | |
904 | 243 | final String attributeValue = attributes.getValue(attrName); |
905 | 243 | if (attributeValue == null) { |
906 | 0 | throw new SAXException( |
907 | "The required attribute [" + attrName + "] of tag [" + tagName + "] was missing."); | |
908 | } | |
909 | 243 | return attributeValue; |
910 | } | |
911 | ||
912 | /** | |
913 | * This method writes the document_usage.xml from the given map, which is expected to contain values of type {@link | |
914 | * DocumentUsage}. All these values are written to the file. | |
915 | * | |
916 | * @param documentUsages the document usages map to be written | |
917 | * | |
918 | * @throws IndexPersistenceException in case the operation fails | |
919 | */ | |
920 | private void writeDocumentUsages(final Map documentUsages) throws IndexPersistenceException { | |
921 | 47 | final StringBuffer fileContent = new StringBuffer(); |
922 | 47 | final Collection usages = documentUsages.values(); |
923 | 47 | fileContent.append("<?xml version=\"1.0\" encoding=\"" + UTF_8_CHARSET + "\" standalone=\"yes\"?><documents>"); |
924 | 47 | for (Iterator iterator = usages.iterator(); iterator.hasNext();) { |
925 | 51 | final DocumentUsage documentUsage = (DocumentUsage) iterator.next(); |
926 | 51 | fileContent.append("<document-usage document-id=\""); |
927 | 51 | fileContent.append(escapeForXML(documentUsage.getDocumentId())); |
928 | 51 | fileContent.append("\" usage-count=\""); |
929 | 51 | fileContent.append(documentUsage.getUsageCount()); |
930 | 51 | fileContent.append("\"><collection-inclusion>"); |
931 | 51 | final String[] collectionIds = documentUsage.getCollectionIds(); |
932 | 64 | for (int i = 0; i < collectionIds.length; i++) { |
933 | 13 | final String id = collectionIds[i]; |
934 | 13 | fileContent.append("<collection collection-id=\""); |
935 | 13 | fileContent.append(escapeForXML(id)); |
936 | 13 | fileContent.append("\"/>"); |
937 | } | |
938 | 51 | fileContent.append("</collection-inclusion>"); |
939 | 51 | fileContent.append("</document-usage>"); |
940 | } | |
941 | 47 | fileContent.append("</documents>"); |
942 | 47 | writeUTF8File(fileContent.toString(), new File(persistenceDirPath, DOC_USAGE_FILE_NAME)); |
943 | 47 | } |
944 | ||
945 | /** | |
946 | * This method writes the given String to the given file, encoding the content as UTF-8. | |
947 | * | |
948 | * @param fileContent the content to be written | |
949 | * @param persistenceFile the file to write to | |
950 | * | |
951 | * @throws IndexPersistenceException in case the operation fails | |
952 | */ | |
953 | private static void writeUTF8File(final String fileContent, final File persistenceFile) | |
954 | throws IndexPersistenceException { | |
955 | 55 | OutputStream outputStream = null; |
956 | try { | |
957 | 55 | final byte[] fileData = fileContent.getBytes(UTF_8_CHARSET); |
958 | 55 | outputStream = new FileOutputStream(persistenceFile); |
959 | 55 | outputStream.write(fileData); |
960 | 0 | } catch (IOException e) { |
961 | 0 | throw new IndexPersistenceException( |
962 | "Error while writing persistence file [" + persistenceFile.getName() + "].", e); | |
963 | } finally { | |
964 | 55 | safeClose(outputStream); |
965 | 55 | } |
966 | 55 | } |
967 | ||
968 | /** | |
969 | * This method does parse the document_usage.xml file. The content is returned as a map containing String keys being | |
970 | * the documentIds and values of type {@link DocumentUsage}. | |
971 | * | |
972 | * @return the map representing the content of the document_usage.xml file | |
973 | * | |
974 | * @throws IndexPersistenceException in case the operation fails | |
975 | */ | |
976 | private Map readDocumentUsages() throws IndexPersistenceException { | |
977 | 140 | final File docUsageFile = new File(persistenceDirPath, DOC_USAGE_FILE_NAME); |
978 | 140 | if (docUsageFile.exists()) { |
979 | 138 | final DocumentUsageHandler dh = new DocumentUsageHandler(); |
980 | 138 | saxParse(docUsageFile, dh); |
981 | 138 | return dh.getDocumentUsages(); |
982 | } else { | |
983 | 2 | throw new IndexPersistenceException("The document usage file,which was expected to exist at [" |
984 | + docUsageFile.getAbsolutePath() + "] did not exist."); | |
985 | } | |
986 | } | |
987 | ||
988 | /** | |
989 | * Replace characters having special meaning in XML with their escaped equivalents, using character entities such as | |
990 | * <tt>'&amp;'</tt> or numeric entities such as<tt>'&#12;'</tt>. | |
991 | * <p/> | |
992 | * The filtering whether a character needs to be escaped is done with respect to the fact that the output is written | |
993 | * as UTF-8 encoding, i.e. two-byte characters do not need to be escaped. | |
994 | * <p/> | |
995 | * See: <ul><li>http://www.w3.org/TR/2000/REC-xml-20001006#syntax</li> | |
996 | * <p/> | |
997 | * <li>http://www.w3.org/TR/2000/REC-xml-20001006#sec-predefined-ent</li> | |
998 | * <p/> | |
999 | * <li>http://www.w3.org/TR/2000/REC-xml-20001006#sec-entexpand</li></ul> | |
1000 | * | |
1001 | * @param string the String to be escaped | |
1002 | * | |
1003 | * @return the escaped version of the given String | |
1004 | */ | |
1005 | private static String escapeForXML(final String string) { | |
1006 | 11331 | final StringBuffer result = new StringBuffer(); |
1007 | ||
1008 | 11331 | final StringCharacterIterator iterator = new StringCharacterIterator(string); |
1009 | 11331 | char ch = iterator.current(); |
1010 | 100604 | while (ch != StringCharacterIterator.DONE) { |
1011 | 89273 | switch (ch) { |
1012 | case '<': | |
1013 | 20 | result.append("<"); |
1014 | 20 | break; |
1015 | case '>': | |
1016 | 20 | result.append(">"); |
1017 | 20 | break; |
1018 | case '"': | |
1019 | 20 | result.append("""); |
1020 | 20 | break; |
1021 | case '\'': | |
1022 | 19 | result.append("'"); |
1023 | 19 | break; |
1024 | case '&': | |
1025 | 1 | result.append("&"); |
1026 | 1 | break; |
1027 | default: | |
1028 | // If the character is not printable, print as character reference. | |
1029 | // Non printables are below ASCII space | |
1030 | 89193 | if (ch < ' ' || ch == '\u00f7') { |
1031 | 4 | result.append("&#"); |
1032 | 4 | result.append(Integer.toString(ch)); |
1033 | 4 | result.append(';'); |
1034 | } else { | |
1035 | 89189 | result.append(ch); |
1036 | } | |
1037 | } | |
1038 | ||
1039 | 89273 | ch = iterator.next(); |
1040 | } | |
1041 | 11331 | return result.toString(); |
1042 | } | |
1043 | ||
1044 | /** | |
1045 | * This is a simple file filter that matches all files with the given prefix and suffix. | |
1046 | */ | |
1047 | private static class PrefixSuffixFilter implements FileFilter { | |
1048 | /** | |
1049 | * This is the prefix to be matched by the filter. | |
1050 | */ | |
1051 | private final String prefix; | |
1052 | /** | |
1053 | * This is the suffix to be matched by the filter. | |
1054 | */ | |
1055 | private final String suffix; | |
1056 | ||
1057 | /** | |
1058 | * Creates new PrefixSuffixFilter that matches the given prefix and suffix. | |
1059 | * | |
1060 | * @param prefix the prefix to be matched | |
1061 | * @param suffix the suffix to be matched | |
1062 | * | |
1063 | * @throws IllegalArgumentException if any arg is <tt>null</tt> | |
1064 | */ | |
1065 | private PrefixSuffixFilter(final String prefix, final String suffix) { | |
1066 | if (prefix == null) { | |
1067 | throw new IllegalArgumentException("The parameter named [prefix] was null."); | |
1068 | } | |
1069 | if (suffix == null) { | |
1070 | throw new IllegalArgumentException("The parameter named [suffix] was null."); | |
1071 | } | |
1072 | ||
1073 | this.prefix = prefix; | |
1074 | this.suffix = suffix; | |
1075 | } | |
1076 | ||
1077 | /** | |
1078 | * Tests whether or not the specified abstract path name should be included in a path name list. | |
1079 | * | |
1080 | * @param pathname The abstract path name to be tested | |
1081 | * | |
1082 | * @return <code>true</code> if and only if <code>pathname</code> should be included | |
1083 | */ | |
1084 | public boolean accept(final File pathname) { | |
1085 | final String name = pathname.getName(); | |
1086 | return name.startsWith(prefix) && name.endsWith(suffix); | |
1087 | } | |
1088 | } | |
1089 | ||
1090 | /** | |
1091 | * This DefaultHandler instance is used to parse document index XML files. | |
1092 | */ | |
1093 | private static class DocumentIndexHandler extends WordSourceIdHandler { | |
1094 | /** | |
1095 | * This is a state flag which signals whether we are between the start and the end of an <em>word</em>. This | |
1096 | * information is used in {@link #characters(char[], int, int)} to determine whether the characters are child | |
1097 | * element of that element or not. | |
1098 | */ | |
1099 | private boolean inWord = false; | |
1100 | /** | |
1101 | * This is a state flag which signals whether we are between the start and the end of an <em>pos</em>. This | |
1102 | * information is used in {@link #characters(char[], int, int)} to determine whether the characters are child | |
1103 | * element of that element or not. | |
1104 | */ | |
1105 | private boolean inPos = false; | |
1106 | ||
1107 | /** | |
1108 | * This is the collator instance matching the document index's locale. It is used for converting words to {@link | |
1109 | * CollationKey}s. as this instance is constructed after the <em>word-source-data</em> element has ended, the | |
1110 | * case that the field is not <tt>null</tt> any more does furthermore signal that a <em>word-source-data</em> | |
1111 | * element has been parsed fully by the super class and thus no more event must be delegated to the super | |
1112 | * class. | |
1113 | */ | |
1114 | private Collator collator = null; | |
1115 | ||
1116 | /** | |
1117 | * This field contains the parsed words of the index as a map from {@link java.text.CollationKey} to a {@link | |
1118 | * java.util.List} of {@link Integer}s being the indices. | |
1119 | */ | |
1120 | ||
1121 | private Map words = new HashMap(); | |
1122 | ||
1123 | /** | |
1124 | * This field contains the currently parsed word, if any. | |
1125 | */ | |
1126 | private String currentWord = null; | |
1127 | /** | |
1128 | * This field contains the currently parsed indices, if any. | |
1129 | */ | |
1130 | private List currentIndices = null; | |
1131 | /** | |
1132 | * This field contains the currently parsed pos value. | |
1133 | */ | |
1134 | private String currentPos = null; | |
1135 | ||
1136 | /** | |
1137 | * This method is called by the parser when a document start element is encountered. | |
1138 | * | |
1139 | * @param uri the uri of the element encountered | |
1140 | * @param localName the local name of the element encountered | |
1141 | * @param qName the qualified name of the element encountered | |
1142 | * @param attributes the attributes of the element encountered | |
1143 | * | |
1144 | * @throws SAXException in case some error occurs | |
1145 | */ | |
1146 | public void startElement(final String uri, final String localName, final String qName, | |
1147 | final Attributes attributes) | |
1148 | throws SAXException { | |
1149 | //delegate to super class as long as no full word-source-data element has been parsed | |
1150 | if (collator == null) { | |
1151 | super.startElement(uri, localName, qName, attributes); | |
1152 | } | |
1153 | ||
1154 | if ("pos".equals(qName)) { | |
1155 | inPos = true; | |
1156 | } else if ("word".equals(qName)) { | |
1157 | inWord = true; | |
1158 | } else if ("word-index".equals(qName)) { | |
1159 | if (collator == null) { | |
1160 | throw new SAXException( | |
1161 | "There was no [word-source-data] specified prior to the [word-index] element."); | |
1162 | } | |
1163 | currentIndices = new ArrayList(); | |
1164 | } | |
1165 | } | |
1166 | ||
1167 | /** | |
1168 | * This method is called by the parser when a TEXT element is encountered. | |
1169 | * | |
1170 | * @param ch the characters encountered | |
1171 | * @param start the start index in ch | |
1172 | * @param length the length in ch | |
1173 | * | |
1174 | * @throws SAXException in case some error occurs | |
1175 | */ | |
1176 | public void characters(final char[] ch, final int start, final int length) throws SAXException { | |
1177 | //delegate to super class as long as no full word-source-data element has been parsed | |
1178 | if (collator == null) { | |
1179 | super.characters(ch, start, length); | |
1180 | } | |
1181 | ||
1182 | if (inPos) { | |
1183 | final String s = new String(ch, start, length); | |
1184 | currentPos = currentPos == null ? s : currentPos + s; | |
1185 | } else if (inWord) { | |
1186 | final String s = new String(ch, start, length); | |
1187 | currentWord = currentWord == null ? s : currentWord + s; | |
1188 | } | |
1189 | } | |
1190 | ||
1191 | /** | |
1192 | * This method is called by the parser when a document end element is encountered. | |
1193 | * | |
1194 | * @param uri the uri of the element encountered | |
1195 | * @param localName the local name of the element encountered | |
1196 | * @param qName the qualified name of the element encountered | |
1197 | * | |
1198 | * @throws SAXException in case some error occurs | |
1199 | */ | |
1200 | public void endElement(final String uri, final String localName, final String qName) throws SAXException { | |
1201 | // delegate to super class as long as no full word-source-data element has been parsed | |
1202 | if (collator == null) { | |
1203 | super.endElement(uri, localName, qName); | |
1204 | } | |
1205 | ||
1206 | if ("pos".equals(qName)) { | |
1207 | if (currentPos == null) { | |
1208 | throw new SAXException( | |
1209 | "The child text element of tag [pos] was expected to contain non-whitespace data."); | |
1210 | } | |
1211 | try { | |
1212 | currentIndices.add(Integer.valueOf(currentPos.trim())); | |
1213 | } catch (NumberFormatException e) { | |
1214 | throw new SAXException( | |
1215 | "The found [pos] tag value [" + currentPos + "] was not parseable into a valid int.", e); | |
1216 | } | |
1217 | inPos = false; | |
1218 | currentPos = null; | |
1219 | } else if ("word".equals(qName)) { | |
1220 | if (currentWord == null) { | |
1221 | throw new SAXException( | |
1222 | "The child text element of tag [word] was expected to contain non-whitespace data."); | |
1223 | } | |
1224 | inWord = false; | |
1225 | } else if ("word-index".equals(qName)) { | |
1226 | // full entry ended, put it to map. | |
1227 | words.put(collator.getCollationKey(currentWord), currentIndices); | |
1228 | currentWord = null; | |
1229 | } else if ("word-source-data".equals(qName)) { | |
1230 | //super class has done its job, now we can create our collator. | |
1231 | try { | |
1232 | collator = Collator.getInstance(createWordSourceId().getSourceLocale()); | |
1233 | } catch (IndexPersistenceException e) { | |
1234 | throw new SAXException("document index header contained unexpected data.", e); | |
1235 | } | |
1236 | } | |
1237 | } | |
1238 | ||
1239 | /** | |
1240 | * This method creates a DocumentIndex instance from the data encountered during parsing. | |
1241 | * | |
1242 | * @return the instance created | |
1243 | * | |
1244 | * @throws IndexPersistenceException in case not enough data for creating the instance has been encountered | |
1245 | * during parsing or the creation of the instance fails | |
1246 | */ | |
1247 | private DocumentIndex createDocumentIndex() throws IndexPersistenceException { | |
1248 | final WordSourceId sourceId = createWordSourceId(); | |
1249 | return new DocumentIndex(sourceId, words); | |
1250 | } | |
1251 | } | |
1252 | ||
1253 | /** | |
1254 | * This is a default handler for parsing WordSourceIds. | |
1255 | */ | |
1256 | private static class WordSourceIdHandler extends DefaultHandler { | |
1257 | /** | |
1258 | * This is a state flag which signals whether we are between the start and the end of an | |
1259 | * <em>source-identity</em>. This information is used in {@link #characters(char[], int, int)} to determine | |
1260 | * whether the characters are child element of that element or not. | |
1261 | */ | |
1262 | private boolean inSourceIdentity = false; | |
1263 | /** | |
1264 | * This is a state flag which signals whether we are between the start and the end of an <em>class-name</em>. | |
1265 | * This information is used in {@link #characters(char[], int, int)} to determine whether the characters are | |
1266 | * child element of that element or not. | |
1267 | */ | |
1268 | private boolean inClassName = false; | |
1269 | ||
1270 | /** | |
1271 | * This is a state flag which signals whether we are between the start and the end of an | |
1272 | * <em>word-source-data</em>. This information is used as an optimization that will disable a lot of unnecessary | |
1273 | * comparison in case the parser is outside the parent element this instance is responsible for. | |
1274 | */ | |
1275 | private boolean inScope = false; | |
1276 | ||
1277 | /** | |
1278 | * This field contains the parsed value of the document index source identity. | |
1279 | */ | |
1280 | private String sourceIdentityValue = null; | |
1281 | /** | |
1282 | * This field contains the parsed value of the document index identity class name. | |
1283 | */ | |
1284 | private String classNameValue = null; | |
1285 | ||
1286 | /** | |
1287 | * This field contains the parsed value of the document index identity locale. | |
1288 | */ | |
1289 | private Locale locale = null; | |
1290 | ||
1291 | /** | |
1292 | * This field contains the parsed value of the document index identity delimiters. | |
1293 | */ | |
1294 | private List delimiters = new ArrayList(); | |
1295 | ||
1296 | /** | |
1297 | * This method is called by the parser when a document start element is encountered. | |
1298 | * | |
1299 | * @param uri the uri of the element encountered | |
1300 | * @param localName the local name of the element encountered | |
1301 | * @param qName the qualified name of the element encountered | |
1302 | * @param attributes the attributes of the element encountered | |
1303 | * | |
1304 | * @throws SAXException in case some error occurs | |
1305 | */ | |
1306 | public void startElement(final String uri, final String localName, final String qName, | |
1307 | final Attributes attributes) | |
1308 | throws SAXException { | |
1309 | if (inScope) { | |
1310 | if ("source-identity".equals(qName)) { | |
1311 | inSourceIdentity = true; | |
1312 | } else if ("class-name".equals(qName)) { | |
1313 | inClassName = true; | |
1314 | } else if ("locale".equals(qName) && locale == null) { | |
1315 | final String language = getAttribute(attributes, "language", "locale"); | |
1316 | final String country = getAttribute(attributes, "country", "locale"); | |
1317 | final String variant = getAttribute(attributes, "variant", "locale"); | |
1318 | locale = new Locale(language, country, variant); | |
1319 | } else if ("entry".equals(qName)) { | |
1320 | delimiters.add(getAttribute(attributes, "delimiter", "entry")); | |
1321 | } | |
1322 | } else if ("word-source-data".equals(qName)) { | |
1323 | inScope = true; | |
1324 | } | |
1325 | } | |
1326 | ||
1327 | /** | |
1328 | * This method is called by the parser when a TEXT element is encountered. | |
1329 | * | |
1330 | * @param ch the characters encountered | |
1331 | * @param start the start index in ch | |
1332 | * @param length the length in ch | |
1333 | * | |
1334 | * @throws SAXException in case some error occurs | |
1335 | */ | |
1336 | public void characters(final char[] ch, final int start, final int length) throws SAXException { | |
1337 | if (inScope) { | |
1338 | final String s = new String(ch, start, length); | |
1339 | if (inSourceIdentity) { | |
1340 | sourceIdentityValue = sourceIdentityValue == null ? s : sourceIdentityValue + s; | |
1341 | } else if (inClassName) { | |
1342 | classNameValue = classNameValue == null ? s : classNameValue + s; | |
1343 | } | |
1344 | } | |
1345 | } | |
1346 | ||
1347 | /** | |
1348 | * This method is called by the parser when a document end element is encountered. | |
1349 | * | |
1350 | * @param uri the uri of the element encountered | |
1351 | * @param localName the local name of the element encountered | |
1352 | * @param qName the qualified name of the element encountered | |
1353 | * | |
1354 | * @throws SAXException in case some error occurs | |
1355 | */ | |
1356 | public void endElement(final String uri, final String localName, final String qName) throws SAXException { | |
1357 | if (inScope) { | |
1358 | if ("source-identity".equals(qName)) { | |
1359 | if (sourceIdentityValue == null) { | |
1360 | throw new SAXException( | |
1361 | "The child text element of tag [source-identity] was expected " | |
1362 | + "to contain non-whitespace data."); | |
1363 | } | |
1364 | inSourceIdentity = false; | |
1365 | } else if ("class-name".equals(qName)) { | |
1366 | if (classNameValue == null) { | |
1367 | throw new SAXException( | |
1368 | "The child text element of tag [class-name] was expected to contain non-whitespace data."); | |
1369 | } | |
1370 | inClassName = false; | |
1371 | } else if ("delimiters".equals(qName)) { | |
1372 | if (delimiters.isEmpty()) { | |
1373 | throw new SAXException("At least one [entry] element is expected in the [delimiters] element."); | |
1374 | } | |
1375 | } else if ("word-source-data".equals(qName)) { | |
1376 | if (sourceIdentityValue == null) { | |
1377 | throw new SAXException( | |
1378 | "There was no [source-identity] element in the [word-source-data] element."); | |
1379 | } | |
1380 | if (classNameValue == null) { | |
1381 | throw new SAXException("There was no [class-name] element in the [word-source-data] element."); | |
1382 | } | |
1383 | if (locale == null) { | |
1384 | throw new SAXException("There was no [locale] element in the [word-source-data] element."); | |
1385 | } | |
1386 | inScope = false; | |
1387 | } | |
1388 | ||
1389 | } | |
1390 | } | |
1391 | ||
1392 | /** | |
1393 | * This method creates a WordSourceId from the data encountered during the parse process. | |
1394 | * | |
1395 | * @return the WordSourceId parsed | |
1396 | * | |
1397 | * @throws IndexPersistenceException in case not enough data for creating the instance has been encountered | |
1398 | * during parsing or the creation of the instance fails | |
1399 | */ | |
1400 | public WordSourceId createWordSourceId() throws IndexPersistenceException { | |
1401 | if (sourceIdentityValue == null) { | |
1402 | throw new IndexPersistenceException("No [source-identity] value encountered during parsing."); | |
1403 | } | |
1404 | if (classNameValue == null) { | |
1405 | throw new IndexPersistenceException("No [class-name] value encountered during parsing."); | |
1406 | } | |
1407 | if (classNameValue.trim().length() == 0) { | |
1408 | throw new IndexPersistenceException("Empty [class-name] value encountered during parsing."); | |
1409 | } | |
1410 | if (locale == null) { | |
1411 | throw new IndexPersistenceException("No [locale] value encountered during parsing."); | |
1412 | } | |
1413 | if (delimiters.isEmpty()) { | |
1414 | throw new IndexPersistenceException("No [delimiters] value encountered during parsing."); | |
1415 | } | |
1416 | ||
1417 | //decode the sourceIdentity object from the Base64 String | |
1418 | final Base64Decoder base64Decoder = new Base64Decoder(Base64Codec.IGNORE_ALL, null, | |
1419 | Base64Codec.STANDARD_ALPHABET); | |
1420 | final byte[] bytes = sourceIdentityValue.getBytes(); | |
1421 | final byte[] out = new byte[bytes.length]; | |
1422 | base64Decoder.setInput(bytes); | |
1423 | int byteCount = 0; | |
1424 | try { | |
1425 | byteCount = base64Decoder.inflate(out); | |
1426 | } catch (DataFormatException e) { | |
1427 | throw new IndexPersistenceException( | |
1428 | "Error while decoding the Base64 data found in the [source-identity] tag.", e); | |
1429 | } | |
1430 | final byte[] serializedData = new byte[byteCount]; | |
1431 | System.arraycopy(out, 0, serializedData, 0, byteCount); | |
1432 | final Object sourceIdentity = Utils.deSerializeObject(serializedData); | |
1433 | ||
1434 | // list to array | |
1435 | final String[] delims = (String[]) delimiters.toArray(new String[delimiters.size()]); | |
1436 | ||
1437 | // build the actual WordSourceId | |
1438 | return new WordSourceId(sourceIdentity, classNameValue, delims, locale); | |
1439 | } | |
1440 | } | |
1441 | ||
1442 | /** | |
1443 | * This is a DefaultHandler for parsing CollectionIndex instances. | |
1444 | */ | |
1445 | private static class DocumentCollectionHandler extends DefaultHandler { | |
1446 | /** | |
1447 | * This is a state flag which signals whether we are between the start and the end of an <em>document-id</em>. | |
1448 | * This information is used in {@link #characters(char[], int, int)} to determine whether the characters are | |
1449 | * child element of that element or not. | |
1450 | */ | |
1451 | private boolean inDocumentId = false; | |
1452 | /** | |
1453 | * This field contains the parsed document ids. | |
1454 | */ | |
1455 | ||
1456 | private List documentIds = new ArrayList(); | |
1457 | ||
1458 | /** | |
1459 | * This field contains the currently parsed document, if any. | |
1460 | */ | |
1461 | private String currentDocumentId = null; | |
1462 | ||
1463 | /** | |
1464 | * This method is called by the parser when a document start element is encountered. | |
1465 | * | |
1466 | * @param uri the uri of the element encountered | |
1467 | * @param localName the local name of the element encountered | |
1468 | * @param qName the qualified name of the element encountered | |
1469 | * @param attributes the attributes of the element encountered | |
1470 | * | |
1471 | * @throws SAXException in case some error occurs | |
1472 | */ | |
1473 | public void startElement(final String uri, final String localName, final String qName, | |
1474 | final Attributes attributes) | |
1475 | throws SAXException { | |
1476 | if ("document-id".equals(qName)) { | |
1477 | inDocumentId = true; | |
1478 | } | |
1479 | } | |
1480 | ||
1481 | /** | |
1482 | * This method is called by the parser when a TEXT element is encountered. | |
1483 | * | |
1484 | * @param ch the characters encountered | |
1485 | * @param start the start index in ch | |
1486 | * @param length the length in ch | |
1487 | * | |
1488 | * @throws SAXException in case some error occurs | |
1489 | */ | |
1490 | public void characters(final char[] ch, final int start, final int length) throws SAXException { | |
1491 | if (inDocumentId) { | |
1492 | final String s = new String(ch, start, length); | |
1493 | currentDocumentId = currentDocumentId == null ? s : currentDocumentId + s; | |
1494 | } | |
1495 | } | |
1496 | ||
1497 | /** | |
1498 | * This method is called by the parser when a document end element is encountered. | |
1499 | * | |
1500 | * @param uri the uri of the element encountered | |
1501 | * @param localName the local name of the element encountered | |
1502 | * @param qName the qualified name of the element encountered | |
1503 | * | |
1504 | * @throws SAXException in case some error occurs | |
1505 | */ | |
1506 | public void endElement(final String uri, final String localName, final String qName) throws SAXException { | |
1507 | if ("document-id".equals(qName)) { | |
1508 | if (currentDocumentId == null) { | |
1509 | throw new SAXException( | |
1510 | "The child text element of tag [document-id] was expected to contain non-whitespace data."); | |
1511 | } | |
1512 | documentIds.add(currentDocumentId.trim()); | |
1513 | inDocumentId = false; | |
1514 | currentDocumentId = null; | |
1515 | } | |
1516 | } | |
1517 | ||
1518 | /** | |
1519 | * This method returns all document ids found during parsing the document collection. | |
1520 | * | |
1521 | * @return the document ids found during parsing | |
1522 | */ | |
1523 | private String[] getDocumentIds() { | |
1524 | return (String[]) documentIds.toArray(new String[documentIds.size()]); | |
1525 | } | |
1526 | } | |
1527 | ||
1528 | /** | |
1529 | * This is the DefaultHandler used for parsing the document_usage.xml. | |
1530 | */ | |
1531 | private static class DocumentUsageHandler extends DefaultHandler { | |
1532 | /** | |
1533 | * This field contains the parsed usages for documents. | |
1534 | */ | |
1535 | ||
1536 | private Map usages = new HashMap(); | |
1537 | ||
1538 | /** | |
1539 | * This field contains the currently parsed document, if any. | |
1540 | */ | |
1541 | private DocumentUsage currentDocumentUsage = null; | |
1542 | ||
1543 | /** | |
1544 | * This method is called by the parser when a document start element is encountered. | |
1545 | * | |
1546 | * @param uri the uri of the element encountered | |
1547 | * @param localName the local name of the element encountered | |
1548 | * @param qName the qualified name of the element encountered | |
1549 | * @param attributes the attributes of the element encountered | |
1550 | * | |
1551 | * @throws SAXException in case some error occurs | |
1552 | */ | |
1553 | public void startElement(final String uri, final String localName, final String qName, | |
1554 | final Attributes attributes) | |
1555 | throws SAXException { | |
1556 | if ("document-usage".equals(qName)) { | |
1557 | final String documentId = getAttribute(attributes, "document-id", "document-usage"); | |
1558 | final String countStr = getAttribute(attributes, "usage-count", "document-usage"); | |
1559 | final int count; | |
1560 | try { | |
1561 | count = Integer.parseInt(countStr.trim()); | |
1562 | } catch (NumberFormatException e) { | |
1563 | throw new SAXException("The found [count] tag attribute value [" | |
1564 | + countStr + "] was not parseable into a valid int.", e); | |
1565 | } | |
1566 | currentDocumentUsage = new DocumentUsage(documentId, count); | |
1567 | } else if ("collection".equals(qName)) { | |
1568 | if (currentDocumentUsage == null) { | |
1569 | throw new SAXException("Illegal structure encountered, expected a [document-usage] " | |
1570 | + "tag to exist as parent of a [collection] tag."); | |
1571 | } | |
1572 | currentDocumentUsage.add(getAttribute(attributes, "collection-id", "collection")); | |
1573 | } | |
1574 | } | |
1575 | ||
1576 | /** | |
1577 | * This method is called by the parser when a document end element is encountered. | |
1578 | * | |
1579 | * @param uri the uri of the element encountered | |
1580 | * @param localName the local name of the element encountered | |
1581 | * @param qName the qualified name of the element encountered | |
1582 | * | |
1583 | * @throws SAXException in case some error occurs | |
1584 | */ | |
1585 | public void endElement(final String uri, final String localName, final String qName) throws SAXException { | |
1586 | if ("document-usage".equals(qName)) { | |
1587 | usages.put(currentDocumentUsage.getDocumentId(), currentDocumentUsage); | |
1588 | currentDocumentUsage = null; | |
1589 | } | |
1590 | } | |
1591 | ||
1592 | /** | |
1593 | * This method does return the document usages parsed from the file. | |
1594 | * | |
1595 | * @return the document usages parsed from the file as map from String to {@link DocumentUsage} | |
1596 | */ | |
1597 | private Map getDocumentUsages() { | |
1598 | return usages; | |
1599 | } | |
1600 | } | |
1601 | ||
1602 | /** | |
1603 | * This class represents the document usage elements in document_usage.xml. | |
1604 | */ | |
1605 | private static final class DocumentUsage { | |
1606 | /** | |
1607 | * This is the document id of the usage information. | |
1608 | */ | |
1609 | private final String documentId; | |
1610 | /** | |
1611 | * This is the use count of the usage information. | |
1612 | */ | |
1613 | private int usageCount; | |
1614 | /** | |
1615 | * This is the collection of document index collections the document is contained in. | |
1616 | */ | |
1617 | private Set collectionIds = new HashSet(); | |
1618 | ||
1619 | /** | |
1620 | * Creates a new DocumentUsage from the given arguments. | |
1621 | * | |
1622 | * @param documentId the document id for the instance | |
1623 | * @param usageCount the usage count for this instance | |
1624 | * | |
1625 | * @throws IllegalArgumentException in case the document id is <tt>null</tt> or empty String | |
1626 | */ | |
1627 | private DocumentUsage(final String documentId, final int usageCount) { | |
1628 | if (documentId == null) { | |
1629 | throw new IllegalArgumentException("The parameter named [documentId] was null."); | |
1630 | } | |
1631 | if (documentId.trim().length() == 0) { | |
1632 | throw new IllegalArgumentException("The parameter named [documentId] was an empty String."); | |
1633 | } | |
1634 | ||
1635 | this.documentId = documentId; | |
1636 | this.usageCount = usageCount; | |
1637 | } | |
1638 | ||
1639 | /** | |
1640 | * Creates a new DocumentUsage from the given argument. The use count is initialized to 0. | |
1641 | * | |
1642 | * @param documentId the document id for the instance | |
1643 | * | |
1644 | * @throws IllegalArgumentException in case the document id is <tt>null</tt> or empty String | |
1645 | */ | |
1646 | private DocumentUsage(final String documentId) { | |
1647 | this.documentId = documentId; | |
1648 | usageCount = 0; | |
1649 | } | |
1650 | ||
1651 | /** | |
1652 | * This method adds the given collection id to the document usage. | |
1653 | * | |
1654 | * @param collectionId the collection id to be removed | |
1655 | */ | |
1656 | private void add(final String collectionId) { | |
1657 | collectionIds.add(collectionId); | |
1658 | } | |
1659 | ||
1660 | /** | |
1661 | * This method removes the given collection id from the document usage. | |
1662 | * | |
1663 | * @param collectionId the collection id to be removed | |
1664 | */ | |
1665 | private void remove(final String collectionId) { | |
1666 | collectionIds.remove(collectionId); | |
1667 | } | |
1668 | ||
1669 | /** | |
1670 | * This method returns the collection ids contained in this document usage. | |
1671 | * | |
1672 | * @return the collection ids contained in this document usage | |
1673 | */ | |
1674 | private String[] getCollectionIds() { | |
1675 | return (String[]) collectionIds.toArray(new String[collectionIds.size()]); | |
1676 | } | |
1677 | ||
1678 | /** | |
1679 | * This method returns the document id of this document usage. | |
1680 | * | |
1681 | * @return the document id of this document usage | |
1682 | */ | |
1683 | private String getDocumentId() { | |
1684 | return documentId; | |
1685 | } | |
1686 | ||
1687 | /** | |
1688 | * This method returns the use count of this document usage. | |
1689 | * | |
1690 | * @return the use count of this document usage | |
1691 | */ | |
1692 | private int getUsageCount() { | |
1693 | return usageCount; | |
1694 | } | |
1695 | ||
1696 | /** | |
1697 | * This method increments the usage count of this document usage. | |
1698 | */ | |
1699 | private void incUsageCount() { | |
1700 | usageCount++; | |
1701 | } | |
1702 | ||
1703 | /** | |
1704 | * This method decrements the usage count of this document usage. | |
1705 | */ | |
1706 | private void decUsageCount() { | |
1707 | usageCount--; | |
1708 | } | |
1709 | } | |
1710 | } |
this report was generated by version 1.0.5 of jcoverage. |
copyright © 2003, jcoverage ltd. all rights reserved. |