overwrite existing entity via bulkloader.Loader - google-app-engine

I was going to do a CSV-based export/import for large data sets with App Engine. My idea was simple.
The first column of the CSV would be the key of the entity.
If it's not empty, the row represents an existing entity and should overwrite the old one.
Otherwise, the row is a new entity and a new one should be created.
I could export the key of each entity by adding a key property:
class FrontExporter(bulkloader.Exporter):
    def __init__(self):
        bulkloader.Exporter.__init__(self, 'Front', [
            ('__key__', str, None),
            ('name', str, None),
        ])
But when I tried to upload the CSV, it failed because bulkloader.Loader.generate_key() handles only the "key_name", not the key itself. That means all exported entities in the CSV would need a unique 'key_name' if I want to modify and re-upload them.
class FrontLoader(bulkloader.Loader):
    def __init__(self):
        bulkloader.Loader.__init__(self, 'Front', [
            ('_UNUSED', lambda x: None),
            ('name', lambda x: x.decode('utf-8')),
        ])

    def generate_key(self, i, values):
        # first column is key
        keystr = values[0]
        if len(keystr) == 0:
            return None
        return keystr
I also tried to load the key directly without using generate_key(), but both attempts failed:
class FrontLoader(bulkloader.Loader):
    def __init__(self):
        bulkloader.Loader.__init__(self, 'Front', [
            ('Key', db.Key),      # not working; just creates a new one.
            ('__key__', db.Key),  # same...
        ])
So, how can I overwrite an existing entity which has no 'key_name'? It would be horrible if I had to give a unique name to every entity.
From the first answer, I was able to handle this problem. :)
def create_entity(self, values, key_name=None, parent=None):
    # if key_name is None:
    #     print 'key_name is None'
    # else:
    #     print 'key_name=<', key_name, '> : length=', len(key_name)
    Validate(values, (list, tuple))
    assert len(values) == len(self._Loader__properties), (
        'Expected %d columns, found %d.' %
        (len(self._Loader__properties), len(values)))
    model_class = GetImplementationClass(self.kind)
    properties = {
        'key_name': key_name,
        'parent': parent,
    }
    for (name, converter), val in zip(self._Loader__properties, values):
        if converter is bool and val.lower() in ('0', 'false', 'no'):
            val = False
        properties[name] = converter(val)
    if key_name is None:
        entity = model_class(**properties)
        # print 'create new one'
    else:
        entity = model_class.get(key_name)
        for key, value in properties.items():
            setattr(entity, key, value)
        # print 'overwrite old one'
    entities = self.handle_entity(entity)
    if entities:
        if not isinstance(entities, (list, tuple)):
            entities = [entities]
        for entity in entities:
            if not isinstance(entity, db.Model):
                raise TypeError('Expected a db.Model, received %s (a %s).' %
                                (entity, entity.__class__))
    return entities

def generate_key(self, i, values):
    # first column is key
    if values[0] is None or values[0] in ('', ' ', '-', '.'):
        return None
    return values[0]

Your best option is probably to override create_entity. You'll need to copy most of the existing code, but modify the constructor call to supply a key argument instead of a key_name argument.
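As a rough sketch of that change (my own untested variant, not from the SDK source): assuming the first CSV column holds the full encoded key exported via __key__ above, and that your SDK version's db.Model constructor accepts an explicit key= argument, the overwrite branch of the copied create_entity could become:

# 'keystr' is an illustrative name for the decoded first CSV column;
# constructing the model with an explicit key makes put() overwrite
# the stored entity instead of creating a new one.
entity = model_class(key=db.Key(keystr))
for name, value in properties.items():
    if name not in ('key_name', 'parent'):  # skip constructor-only entries
        setattr(entity, name, value)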


How to build an array of Objects in a loop

I'm new to Python but I'm a PowerShell user, so maybe what I'm trying to do isn't possible the same way in Python.
In Python 3, to learn, I'm trying to make a list of the files in a directory and store it in an indexstore variable.
To do that, this is what I've done:
I created 2 classes, Index and IndexStore:
class Index(object):
    def __init__(self, filepath, size):
        self.filepath = filepath
        self.size = size
and
class IndexStore(object):
    def __init__(self, filepath, size):
        self.filepath = filepath
        self.size = size
After that I get my file list from a location on my HDD:
listOfFile = os.listdir(SourcePath)
With this list I start a loop where I get the full path and the size of each file (like ForEach) in 2 variables, fullPath and fileSize:
fullPath = os.path.join(SourcePath, entry)
fileSize: int = os.path.getsize(fullPath)
With those values I set the Index object:
setattr(Index, 'filepath', fullPath)
setattr(Index, 'size', fileSize)
And it's working with
pprint(vars(Index))
I get the result:
mappingproxy({'__dict__': <attribute '__dict__' of 'Index' objects>,
              '__doc__': None,
              '__init__': <function Index.__init__ at 0x00000271B9C7E940>,
              '__module__': '__main__',
              '__repr__': <property object at 0x00000271B9F30590>,
              '__weakref__': <attribute '__weakref__' of 'Index' objects>,
              'filepath': 'D:\AmigaForever\AmigaOS3.9.ISO',
              'size': 28862259})
After that is my problem! In PowerShell, if I want to add object2 to objectlist1, I just do Objectlist1 += object2 and the work is done, but in Python 3.x I tried many things from forums without success. The best way seems to be:
IndexStore = []
IndexStore.append(Index(fullPath, fileSize))
But the IndexStore variable stays empty, and if I try to print it with
print(IndexStore)
pprint(vars(IndexStore))
the console says:
print(IndexStore)
TypeError: 'tuple' object is not callable
Can you help me please? Am I checking the value of my IndexStore correctly,
or is my error in how I'm appending the values?
In a second loop I want to use the values of the object array again to continue my code.
With the goal of 'using Python 3 to make a list of the files in a directory and store it in an indexstore variable':
The first problem I see is that you create a class IndexStore but later completely shadow that class when you assign the variable IndexStore = [].
So, given you have a valid list of files from:
listOfFile = os.listdir(SourcePath)
This is an approach that will work:
First build an IndexItem class:
class IndexItem:
    def __init__(self, filepath, size):
        self.filepath = filepath
        self.size = size

    def __repr__(self):
        # Returns a string representation of the IndexItem
        return f"({self.filepath}, {self.size})"
This class has an initialization method which preserves the values passed during instantiation, and a __repr__ method which converts the index values into readable text.
Next we create the IndexStore Class as follows:
class IndexStore:
    def __init__(self):
        self._index = []

    def append(self, o: object):
        # Append an entry onto the index
        self._index.append(o)

    def __repr__(self):
        # Returns a string representation of self._index
        return ', '.join(str(x) for x in self._index)
This class includes an initializer which creates a list to hold the IndexItems passed to it, an append method to add IndexItems to the IndexStore, and finally a __repr__ to create a readable string of the values.
Finally, we implement the basic functionality required to build the IndexStore as follows:
listOfFile = os.listdir(sourcePath)
index = IndexStore()
for f in listOfFile[:5]:
    # For each entry f in listOfFile
    fullpath = os.path.join(sourcePath, f)
    # add an instantiation of IndexItem to the IndexStore
    index.append(IndexItem(fullpath, int(os.path.getsize(fullpath))))
print(index)
A simpler and more direct approach to this problem makes use of Python's built-in data structures and capabilities, as follows:
IndexStore = []
listOfFile = os.listdir(sourcePath)
for f in listOfFile[:5]:
    # For each entry f in listOfFile
    fullpath = os.path.join(sourcePath, f)
    # add a (fullpath, size) tuple to IndexStore
    IndexStore.append((fullpath, int(os.path.getsize(fullpath))))
print(IndexStore)
In this approach, the class definitions are eliminated, and IndexStore contains a list of tuples, each tuple holding the full path to the file and its size.
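Since the question also mentions reusing the values in a second loop, here is a minimal sketch of unpacking the stored tuples (my own addition, not part of the original answer):

# Each entry is a (fullpath, size) tuple, so it unpacks directly
# in the loop header.
for filepath, size in IndexStore:
    print(f"{filepath}: {size} bytes")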

How to create a completely (uniformly) random dataset on PyTorch

I need to run some experiments on custom datasets using PyTorch. The question is: how can I create a dataset for use with torch.utils.data.DataLoader?
I have two lists: one is called values and has a datapoint tensor at every entry, and the other is called labels, with the corresponding label. What I did is the following:
for i in range(samples):
    dataset[i] = [values[i], labels[i]]
So I have a list with each datapoint and its respective label, and then I tried the following:
dataset = torch.tensor(dataset).float()
dataset = torch.utils.data.TensorDataset(dataset)
data_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=100, shuffle=True, num_workers=4, pin_memory=True)
But, first of all, I get a "not a sequence" error from the torch.tensor call, and second, I'm not sure this is the right way to create one. Any suggestions?
Thank you very much!
You do not need to overload DataLoader, but rather create a Dataset for your data.
For instance,
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self):
        super(MyDataset, self).__init__()
        # do stuff here?
        self.values = values
        self.labels = labels

    def __len__(self):
        return len(self.values)  # number of samples in the dataset

    def __getitem__(self, index):
        # return one sample as a (value, label) pair
        return self.values[index], self.labels[index]
Just to enrich the answer by @Shai:
import numpy as np
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, values):
        super(MyDataset, self).__init__()
        self.values = values

    def __len__(self):
        return len(self.values)

    def __getitem__(self, index):
        return self.values[index]

values = np.random.rand(51000, 3)
dataset = MyDataset(values)
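To close the loop on the original question, a minimal sketch of feeding this dataset to a DataLoader (batch size taken from the question; the rest is illustrative):

from torch.utils.data import DataLoader

# The default collate function stacks the numpy rows into a
# (batch_size, 3) float64 tensor.
loader = DataLoader(dataset, batch_size=100, shuffle=True)
for batch in loader:
    print(batch.shape)  # torch.Size([100, 3])
    break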

Slick MSSQL inserting object with auto increment

I've recently had to move a project over from MySQL to MSSQL. I'm using IDENTITY(1,1) on the id columns of my tables to match MySQL's auto-increment feature.
When I try to insert an object though, I'm getting this error:
[SQLServerException: Cannot insert explicit value for identity column in table 'categories' when IDENTITY_INSERT is set to OFF.]
After some research I found out that it's because I'm trying to insert an explicit value (0) for the id column. So, for example, I have a Category object:
case class Category(
  id: Long = 0L,
  name: String
)

object Category extends Table[Category]("categories") {
  def name = column[String]("name", O.NotNull)
  def id = column[Long]("id", O.PrimaryKey, O.AutoInc)
  def * = id ~ name <> (Category.apply _, Category.unapply _)

  def add(model: Category) = withSession { implicit session =>
    Category.insert(model)
  }

  def remove(id: Long) = withSession { implicit session =>
    try { Some(Query(Category).filter(_.id === id).delete) }
    catch { case _ => None }
  }
}
Is there a way to insert my object into the database, ignoring the 0L, without MSSQL throwing an SQLException? MySQL would just ignore the id's value and do the increment as if it hadn't received an id.
I'd really rather not create a new case class with everything but the id.
Try redefining your add method like this and see if it works for you:
def add(model: Category) = withSession { implicit session =>
  Category.name.insert(model.name)
}
If you had more columns then you could have added a forInsert projection to your Category table class that specified all fields except id, but since you don't, this should work instead.
EDIT
Now if you do have more than 2 fields on your table objects, then you can do something like this, which is described in the Lifted Embedding documentation here:
case class Category(
  id: Long = 0L,
  name: String,
  foo: String
)

object Category extends Table[Category]("categories") {
  def id = column[Long]("id", O.PrimaryKey, O.AutoInc)
  def name = column[String]("name", O.NotNull)
  def foo = column[String]("foo", O.NotNull)
  def * = id ~ name ~ foo <> (Category.apply _, Category.unapply _)

  def forInsert = name ~ foo <> (
    t => Category(0L, t._1, t._2),
    (c: Category) => Some((c.name, c.foo))
  )

  def add(model: Category) = withSession { implicit session =>
    Category.forInsert insert model
  }

  def remove(id: Long) = withSession { implicit session =>
    try { Some(Query(Category).filter(_.id === id).delete) }
    catch { case _ => None }
  }

  def withSession(f: Session => Unit) {
  }
}

ndb get & get_or_insert: how to use? (always raises Exception)

I wrote the code below:
from google.appengine.ext import ndb

__metaclass__ = type

class UserSession(ndb.Model):
    session = ndb.BlobProperty()

class KV:
    @staticmethod
    def get(id):
        r = ndb.Key(UserSession, int(id)).get()
        if r:
            return r.session

    @staticmethod
    def set(id, value):
        return UserSession.get_or_insert(int(id), session=value)

    @staticmethod
    def delete(id):
        ndb.Key(UserSession, int(id)).delete()
Then, when I write
id = 1
key = ndb.Key(UserSession, int(id))
UserSession.get_or_insert(key, session=1)
the SDK raises:
TypeError: name must be a string; received Key('UserSession', 1)
When I call KV.get(), the SDK raises:
File "/home/bitcoin/42btc/zapp/_plugin/auth/model/gae/user.py", line 14, in get
r = ndb.Key(UserSession,int(id)).get()
...
BadRequestError: missing key id/name
So, how should I use NDB?
The get_or_insert() method takes a string which is only the ID part of the key, not a Key. It cannot use numeric IDs.
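A minimal sketch of the corresponding fix (my own illustration): pass the string form of the ID to get_or_insert(), and use the same string form when building keys for lookup.

# get_or_insert() only accepts a string key name, so store under str(id)...
UserSession.get_or_insert(str(id), session=value)
# ...and look the entity up with the same string ID.
r = ndb.Key(UserSession, str(id)).get()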

bulk update/delete entities of different kind in db.run_in_transaction

Here goes pseudo code for bulk updating/deleting entities of different kinds in a single transaction. Note that the Album and Song entities have AlbumGroup as their root entity (i.e., they share the same parent entity).
class Album:
    pass

class Song:
    album = db.ReferenceProperty(reference_class=Album, collection_name="songs")

def bulk_update_album_group(album):
    updated = [album]
    deleted = []
    for song in album.songs:
        if song.is_updated:
            updated.append(song)
        if song.is_deleted:
            deleted.append(song)
    db.put(updated)
    db.delete(deleted)

a = Album.all().filter("...").get()
# bulk update/delete album.
db.run_in_transaction(bulk_update_album_group, a)
But I hit the famous "Only Ancestor Queries in Transactions" error when iterating back-reference properties like album.songs. I guess an ancestor() filter would not help because those entities are modified in memory.
So I modified the example like this: prepare all updated/deleted entities before calling the transaction.
def bulk_update_album2(album):
    updated = [album]
    deleted = []
    for song in album.songs:
        if song.is_updated:
            updated.append(song)
        if song.is_deleted:
            deleted.append(song)

    def txn(updated, deleted):
        db.put(updated)
        db.delete(deleted)

    db.run_in_transaction(txn, updated, deleted)
Now I found that iterating a back-reference property forces a reload of the existing entities, so re-iterating a back-reference property after modifying entities should be avoided!!
All I want to verify is:
When I need to bulk update/delete many entities of different kinds, is there a good coding pattern for this situation? Can my last code be considered a good one?
Here goes the full code example:
from google.appengine.ext import webapp
from google.appengine.ext.webapp import util
import logging
from google.appengine.ext import db

class Album(db.Model):
    name = db.StringProperty()

    def __repr__(self):
        return "%s%s" % (self.name, [song for song in self.songs])

class Song(db.Model):
    album = db.ReferenceProperty(reference_class=Album, collection_name='songs')
    name = db.StringProperty()
    playcount = db.IntegerProperty(default=0)

    def __repr__(self):
        return "%s(%d)" % (self.name, self.playcount)

def create_album(name):
    album = Album(name=name)
    album.put()
    for i in range(0, 5):
        song = Song(parent=album, album=album, name='song#%d' % i)
        song.put()
    return album

def play_all_songs(album):
    logging.info(album)
    # play all songs
    for song in album.songs:
        song.playcount += 1
        logging.info(song)
    # play count also 0 here
    logging.info(album)

    def save_play_count(album):
        updated = []
        for song in album.songs:
            updated.append(song)
        db.put(updated)

    db.run_in_transaction(save_play_count, album)

def play_all_songs2(album):
    logging.info("loading : %s" % album)
    # play all songs
    updated = []
    for song in album.songs:
        song.playcount += 1
        updated.append(song)
    logging.info("updated: %s" % updated)
    db.put(updated)
    logging.info("after save: %s" % album)

def play_all_songs3(album):
    logging.info("loading : %s" % album)
    # play all songs
    updated = []
    for song in album.songs:
        song.playcount += 1
        updated.append(song)
    # reload
    for song in album.songs:
        pass
    logging.info("updated: %s" % updated)

    def bulk_save_play_count(updated):
        db.put(updated)

    db.run_in_transaction(bulk_save_play_count, updated)
    logging.info("after save: %s" % album)

class MainHandler(webapp.RequestHandler):
    def get(self):
        self.response.out.write('Hello world!')
        album = Album.all().filter('name =', 'test').get()
        if not album:
            album = db.run_in_transaction(create_album, 'test')
        # BadRequestError: Only ancestor queries are allowed inside transactions.
        # play_all_songs(album)
        # ok
        # play_all_songs2(album)
        play_all_songs3(album)

def main():
    application = webapp.WSGIApplication([('/', MainHandler)],
                                         debug=True)
    util.run_wsgi_app(application)

if __name__ == '__main__':
    main()
Please note that the ReferenceProperty is not enough to put the entities in the same entity group. When you create a Song model you should pass a parent argument with the model's parent (e.g., the Album).
It looks like this:
album = Album.all().filter("...").get()
new_song = Song(name='Seven Nation Army', parent=album)
new_song.save()
See the documentation about ancestors.
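Once each Song is created with its Album as parent, the original single-transaction approach becomes possible with an ancestor query in place of the back-reference property. A rough sketch (my own, untested):

def update_playcounts(album_key):
    # Ancestor queries are allowed inside transactions, so the songs
    # can be fetched and updated atomically.
    songs = Song.all().ancestor(album_key).fetch(1000)
    for song in songs:
        song.playcount += 1
    db.put(songs)

db.run_in_transaction(update_playcounts, album.key())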
