wiki:ApertureDataCrawler

Version 7 (modified by dburkhar, 18 years ago) (diff)

--

Data Crawler

Java Interface

Probably equal to:

We should probably also merge biz.aduna.datasource.crawler.DataCrawlerListener with source:/trunk/gnoDesktopSearch/src/java/org/gnowsis/desktopsearch/crawler/CrawlerListener.java

/**
 * A DataCrawler accesses the physical source represented by a DataSource
 * and delivers a stream of DataObjects representing the individual items
 * in that source.
 */
public interface DataCrawler {

    /**
     * Returns the root urls of the data inside the adapter.
     *
     * <p>Method from Gnowsis - StructuredAdapter.
     *
     * <p>The data inside the adapter may have a "root" url from which other
     * urls can be reached through the links graph. This is the same function
     * as in GenericAdapter, but here a set of root urls that point to
     * containers must be returned.
     *
     * @return an array of strings
     */
    String[] getRootUrls();

    /**
     * Lists the sub-containers of the passed container.
     *
     * <p>Method from Gnowsis - StructuredAdapter.
     *
     * <p>The result is an RDF container that represents the containerUri but
     * contains only other containers.
     *
     * @param containerUri the uri of the container
     * @return null if the container cannot be found, otherwise a Container
     *         with Containers
     */
    Container listSubContainers(String containerUri);

    /**
     * Lists the resources in the passed container.
     *
     * <p>Method from Gnowsis - StructuredAdapter.
     *
     * <p>The result is an RDF container that represents the containerUri but
     * contains only the resources/documents/files/emails in the container.
     * Each container item (which has to have a unique uri in the domain of
     * the adapter) has to be described using exactly these properties:
     * <ul>
     * <li>rdf:type</li>
     * <li>rdfs:label</li>
     * <li>dcterms:modified - Date on which the resource was changed.
     *     {@code SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss");}</li>
     * <li>FUTURE!!! optional: {@code x <rdf:type> <rdfs:Container>}. When
     *     this statement is given, the resource can contain sub-resources.
     *     This happens in email stores or with zip files.</li>
     * </ul>
     * Each resource must have a URI that can be used in a getCBD call.
     *
     * @param containerUri the uri of the container
     * @return null if the container cannot be found, otherwise a Container
     *         with Resources
     */
    Container listResources(String containerUri);

    /**
     * Constants used to indicate why a scan was stopped.
     */
    enum ExitCode {

        /** The scan procedure terminated normally. */
        COMPLETED,

        /** The DataCrawler was requested to abort the scan procedure. */
        BY_REQUEST,

        /** An error occurred that made further scanning impossible. */
        FATAL_EXCEPTION
    }

    /**
     * Returns the DataSource crawled by this DataCrawler.
     *
     * @return the DataSource of this DataCrawler
     */
    DataSource getDataSource();

    /**
     * Starts a scan for DataObjects over the configured domain defined in
     * the DataSource.
     *
     * <p>If this is not the first run of this DataCrawler, only the
     * differences with the previous run are reported, unless the previous
     * scan results have been cleared.
     */
    void scan();

    /**
     * Stops a running scan operation as fast as possible.
     *
     * <p>This method may return before the operation has actually been
     * stopped.
     */
    void stopScanning();

    /**
     * Clears all stored scan results.
     *
     * <p>Any listeners registered with this crawler will be notified of the
     * removal of the data objects. The next call to {@code scan()} will
     * again report all data objects in the configured domain.
     */
    void clearScanResults();

    /**
     * Gets the ScanReport of the last performed scan, or of the current scan
     * when a scan is in progress.
     *
     * @return the ScanReport, or null when no scan was performed in this
     *         session and there is no scan report available from the
     *         previous session
     */
    ScanReport getLastScanReport();

    /**
     * Adds a DataCrawlerListener to which this DataCrawler should report any
     * scanned or cleared data objects.
     *
     * @param listener the DataCrawlerListener to add
     */
    void addListener(DataCrawlerListener listener);

    /**
     * Removes a DataCrawlerListener from this DataCrawler.
     *
     * @param listener the DataCrawlerListener to remove
     */
    void removeListener(DataCrawlerListener listener);
}