indexText: added allElementsOf

agocorona · Apr 24, 2013 · 304eb8d · 304eb8d
1 parent 80d4f04
commit 304eb8d
Show file tree

Hide file tree

Showing 2 changed files with 66 additions and 12 deletions.
diff --git a/Data/TCache.hs b/Data/TCache.hs
@@ -229,6 +229,49 @@ gives:
 ,addTrigger
 
 -- * Cache control
+{-- |
+
+The mechanism for dropping elements from the cache is too lazy. `flushDBRef`, for example
+just delete the data element  from the TVar, but the TVar node
+remains attached to the table so there is no decrement on the number of elements.
+The element is garbage collected unless you have a direct reference to the element, not the DBRef
+Note that you can still have a valid reference to this element, but this element  is no longer
+in the cache. The usual thing is that you do not have it, and the element will be garbage
+collected (but still there will be a NotRead entry for this key!!!). If the DBRef is read again, the
+TCache will go to permanent storage to retrieve it.
+
+clear opertions such `clearsyncCache` does something similar:  it does not delete the
+element from the cache. It just inform the garbage collector that there is no longer necessary to maintain
+the element in the cache. So if the element has no other references (maybe you keep a
+variable that point to that DBRef) it will be GCollected.
+If this is not possible, it will remain in the cache and will be treated as such,
+until the DBRef is no longer referenced by the program. This is done by means of a weak pointer
+
+All these complications are necessary because the programmer can  handle DBRefs directly,
+so the cache has no complete control of the DBRef life cycle, short to speak.
+
+a DBRef can be in the states:
+
+- `Exist`:  it is in the cache
+
+- `DoesNotExist`: neither is in the cache neither in storage: it is like a cached "notfound" to
+speed up repeated failed requests
+
+- `NotRead`:  may exist or not in permanent storage, but not in the cache
+
+
+In terms of Garbage collection it may be:
+
+
+
+1 - pending garbage collection:  attached to the hashtable by means of a weak pointer: delete it asap
+
+2 - cached: attached by a direct pointer and a weak pointer: It is being cached
+
+
+clearsyncCache just pass elements from 2 to 1
+
+--}
 ,flushDBRef
 ,flushKey
 ,invalidateKey
@@ -530,7 +573,7 @@ onNothing io onerr= do
 flushDBRef ::  (IResource a, Typeable a) =>DBRef a -> STM()
 flushDBRef (DBRef _ tv)=   writeTVar  tv  NotRead
 
--- flush the element with the given key
+-- | flush the element with the given key
 flushKey key=  do
    (cache,time) <- unsafeIOToSTM $ readIORef refcache
    c <- unsafeIOToSTM $ H.lookup cache key
@@ -542,7 +585,7 @@ flushKey key=  do
             Nothing -> unsafeIOToSTM (finalize w)  >> flushKey key
        Nothing   -> return ()
 
--- label the object as not existent in database
+-- | label the object as not existent in database
 invalidateKey key=  do
    (cache,time) <- unsafeIOToSTM $ readIORef refcache
    c <- unsafeIOToSTM $ H.lookup cache key
@@ -758,6 +801,7 @@ updateListToHash hash kv= mapM (update1 hash) kv where
 
 
 -- | Start the thread that periodically call `clearSyncCache` to clean and writes on the persistent storage.
+-- it is indirecly set by means of `syncWrite`, since it is more higuer level. I recommend to use the latter
 -- Otherwise, 'syncCache' or `clearSyncCache` or `atomicallySync` must be invoked explicitly or no persistence will exist.
 -- Cache writes allways save a coherent state
 clearSyncCacheProc ::
@@ -802,8 +846,6 @@ data SyncMode= Synchronous   -- ^ sync state to permanent storage when `atomical
 tvSyncWrite= unsafePerformIO $ newIORef  (Synchronous, Nothing)
 
 -- | Specify the cache synchronization policy with permanent storage. See `SyncMode` for details
--- 
-
 syncWrite::  SyncMode -> IO()
 syncWrite mode= do
      (_,thread) <- readIORef tvSyncWrite

diff --git a/Data/TCache/IndexText.hs b/Data/TCache/IndexText.hs
@@ -54,7 +54,7 @@ main= do
 
 -}
 
-module Data.TCache.IndexText(indexText, indexList,  contains, containsElem) where
+module Data.TCache.IndexText(indexText, indexList,  contains, containsElem, allElemsOf) where
 import Data.TCache
 import Data.TCache.IndexQuery
 import Data.TCache.Defs
@@ -141,15 +141,15 @@ indexText
      => (a -> b)      -- ^ field to index
      -> (b -> T.Text) -- ^ method to convert the field content to lazy Text (for example `pack` in case of String fields). This permits to index non Textual fields
      -> IO ()
-indexText sel convert= addTrigger (indext sel  (words1 . convert)) where
+indexText sel convert= addTrigger (indext sel  (words1 . convert))
 
 -- | trigger the indexation of list fields with elements convertible to Text
 indexList
   :: (IResource a, Typeable a, Typeable b)
      => (a -> b)      -- ^ field to index
      -> (b -> [T.Text]) -- ^ method to convert a field element to Text (for example `pack . show` in case of elemets with Show instances)
      -> IO ()
-indexList sel convert= addTrigger (indext sel  convert) where
+indexList sel convert= addTrigger (indext sel  convert)
 
 
 indext :: (IResource a, Typeable a,Typeable b)
@@ -200,9 +200,21 @@ containsElem  sel wstr = do
             let wordsr = catMaybes $ map (\n -> M.lookup n mmapIntString) $ catMaybes mns
             return $ map getDBRef wordsr
 
-words1= filter ( (<) 2 . T.length) . T.split (\c -> isSeparator c || c=='\n' || isPunctuation c )
+allElemsOf :: (IResource a, Typeable a, Typeable b) => (a -> b) -> STM [T.Text]
+allElemsOf  sel  = do
+    let [t1, t2]=  typeRepArgs $! typeOf sel
+    let t=  show t1 ++ show t2
+    let u= undefined
+    mr <- withSTMResources [IndexText t u u u u]
+       $ \[r] -> resources{toReturn= r}
+    case mr of
+      Nothing -> return []
+      Just (IndexText t n _ _ map) -> return $ M.keys map
+
+words1= filter filterWordt {-( (<) 2 . T.length)-} . T.split (\c -> isSeparator c || c=='\n' || isPunctuation c )
 
--- | return the DBRefs whose fields include all the words of length three or more in the requested text contents
+-- | return the DBRefs whose fields include all the words in the requested text contents.Except the
+-- words  with less than three characters that are not digits or uppercase, that are filtered out before making the query
 contains
   :: (IResource a, Typeable a, Typeable b)
   =>( a -> b)      -- ^ field to search in
@@ -212,8 +224,8 @@ contains sel str= case  words str of
      [] -> return []
      [w] -> containsElem sel w
      ws  -> do
-        let rs = map (containsElem sel) $ filter (\t -> length t >2) ws
+        let rs = map (containsElem sel) $ filter filterWord ws
         foldl (.&&.) (head rs)  (tail rs)
 
-
-
+filterWordt w= T.length w >2 || or (map (\c -> isUpper c || isDigit c) (T.unpack w))
+filterWord w= length w >2 || or (map (\c -> isUpper c || isDigit c) w)