Keras में रिवर्स वर्ड एम्बेडिंग - पायथन

मैं Keras में एक चैटबॉट बनाने की कोशिश कर रहा हूं। मैं शब्दावली के प्रत्येक शब्द को उसकी अपनी आईडी निर्दिष्ट कर रहा हूं। एक प्रशिक्षण नमूना इस तरह दिखता है:

[0 0 0 0 0 0 32 328 2839 13 192 1 ] -> [23 3289 328 2318 12 0 0 0 0 0 0 0]

फिर मैं इन आईडी को आकार 32 के वेक्टरों में एम्बेड करने के लिए Keras की Embedding परत का उपयोग कर रहा हूं। उसके बाद मैं छिपी हुई परतों के रूप में LSTM परतों का उपयोग कर रहा हूं। समस्या यह है कि मेरा आउटपुट एम्बेड की गई आईडी की एक सूची है।

[ 0.16102183 0.1238187 0.1159694 0.13688719 0.12964118 0.12848872 0.13515817 0.13582146 0.16919741 0.15453722 ... ]

मैं इन एम्बेडिंग को वापस अपनी मूल शब्दावली के शब्दों में कैसे बदल सकता हूं?

यह रहा मेरा कोड:

from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer 
from keras.models import Sequential, load_model 
from keras.layers import LSTM 
from keras.layers.embeddings import Embedding 
from keras.preprocessing import sequence 

import os 

import numpy as np 
import cPickle as pickle 


class Chatbot(object):
    """Chatbot wrapper around a Keras Sequential model (Embedding + LSTM stack).

    Sentences are tokenized with NLTK, mapped to integer ids through
    ``self.vocabulary`` and padded to ``self.max_length`` before being fed to
    the network.

    Attributes:
        h_layers: number of sequence-returning LSTM layers placed between the
            embedding layer and the final LSTM layer.
        seq2seq: the Keras model, or ``None`` until ``train``/``load`` ran.
        max_length: longest tokenized sentence seen so far (including the
            ``<START>``/``<END>`` markers); used as the padding length.
        vocabulary: dict mapping each known token to its integer id.
    """

    def __init__(self, h_layers=1):
        """Create an untrained chatbot with ``h_layers`` hidden LSTM layers."""
        # self.name = name
        self.h_layers = h_layers
        self.seq2seq = None
        self.max_length = 0
        self.vocabulary = {}

    @staticmethod
    def load(model_name):
        """Restore a chatbot previously written by ``save``.

        Args:
            model_name: name of the sub-directory of ``models/`` to read.

        Returns:
            The unpickled ``Chatbot`` with its Keras model re-attached.
        """
        # NOTE(security): pickle.load executes arbitrary code contained in the
        # file. Only load model directories that come from a trusted source.
        with open('models/{}/chatbot_object.pkl'.format(model_name), 'rb') as pickle_file:
            obj = pickle.load(pickle_file)
        obj.seq2seq = load_model('models/{}/seq2seq.h5'.format(model_name))
        return obj

    @staticmethod
    def _tokenize(text):
        """Return the lower-cased NLTK tokens of ``text`` wrapped in <START>/<END>."""
        return ['<START>'] + [w.lower() for w in word_tokenize(text)] + ['<END>']

    def _encode(self, tokens):
        """Map ``tokens`` to vocabulary ids, dropping tokens without an id."""
        return [self.vocabulary[w] for w in tokens if w in self.vocabulary]

    def train(self, x_train, y_train):
        """Build the vocabulary from the data and fit a fresh model.

        Args:
            x_train: sequence of input sentences (strings).
            y_train: sequence of reply sentences (strings), aligned with
                ``x_train``.

        The caller's sequences are not modified.
        """
        # Fit once on both corpora: calling fit() a second time rebuilds the
        # vocabulary from scratch, which would discard every word that only
        # occurs in x_train.
        count_vect = CountVectorizer()
        count_vect.fit(list(x_train) + list(y_train))

        self.vocabulary = dict(count_vect.vocabulary_)
        base = len(self.vocabulary)
        self.vocabulary.update({'<START>': base,
                                '<END>': base + 1,
                                '<PAD>': base + 2,
                                '<UNK>': base + 3})

        # Work on new lists so the caller's data is left untouched.
        x_tokens = [self._tokenize(text) for text in x_train]
        y_tokens = [self._tokenize(text) for text in y_train]

        # The padding length is taken from the tokenized sentences, before
        # out-of-vocabulary tokens are dropped.
        for sample in x_tokens + y_tokens:
            if len(sample) > self.max_length:
                self.max_length = len(sample)

        # NOTE(review): tokens missing from the vocabulary are dropped here,
        # while respond() replaces them with <PAD>; '<UNK>' gets an id but is
        # never used. Confirm the intended out-of-vocabulary policy.
        x_ids = [self._encode(sample) for sample in x_tokens]
        y_ids = [self._encode(sample) for sample in y_tokens]

        pad_id = self.vocabulary['<PAD>']
        # Inputs use pad_sequences' default padding side; targets are padded
        # at the end.
        x_data = np.asarray(sequence.pad_sequences(
            x_ids, maxlen=self.max_length, value=pad_id))
        y_data = np.asarray(sequence.pad_sequences(
            y_ids, maxlen=self.max_length, padding='post', value=pad_id))

        embedding_vector_length = 32

        self.seq2seq = Sequential()
        self.seq2seq.add(Embedding(len(self.vocabulary), embedding_vector_length,
                                   input_length=self.max_length))

        for _ in range(self.h_layers):
            self.seq2seq.add(LSTM(self.max_length, return_sequences=True))

        # The last layer emits one float vector of length max_length per
        # sample; it is fitted directly against the padded target id rows.
        self.seq2seq.add(LSTM(self.max_length))
        self.seq2seq.compile(loss='cosine_proximity', optimizer='adam', metrics=['accuracy'])
        # NOTE(review): only the first 100 samples are used for fitting.
        self.seq2seq.fit(x_data[:100], y_data[:100], epochs=5, batch_size=32)

    def save(self, filename):
        """Write the model and the chatbot state to ``models/<filename>/``.

        The Keras model is stored as ``seq2seq.h5`` and the rest of the object
        is pickled to ``chatbot_object.pkl``. The model stays attached to this
        instance afterwards, so the chatbot remains usable.

        Args:
            filename: name of the sub-directory of ``models/`` to write to.
        """
        model_dir = 'models/{}'.format(filename)
        # Creates 'models' as well when it is missing, without going through
        # a shell.
        if not os.path.isdir(model_dir):
            os.makedirs(model_dir)

        model = self.seq2seq
        model.save('models/{}/seq2seq.h5'.format(filename))

        # The model is saved separately above, so detach it while pickling
        # and re-attach it afterwards, even if pickling fails.
        self.seq2seq = None
        try:
            with open('models/{}/chatbot_object.pkl'.format(filename), 'wb') as pickle_file:
                pickle.dump(self, pickle_file)
        finally:
            self.seq2seq = model

    def respond(self, text):
        """Run the model on ``text`` and return its raw prediction.

        Args:
            text: the input sentence.

        Returns:
            The first (and only) row of ``self.seq2seq.predict`` for the
            encoded sentence.
        """
        pad_id = self.vocabulary['<PAD>']
        # Unknown tokens are replaced by the padding id (see the note in
        # train() about the out-of-vocabulary policy).
        tokens = [self.vocabulary.get(w, pad_id) for w in self._tokenize(text)]
        x = sequence.pad_sequences([tokens], maxlen=self.max_length, value=pad_id)
        prediction = self.seq2seq.predict(x, batch_size=1)
        return prediction[0]

स्रोत

2017-08-19 Noah Chalifour

मुझे भी इसका कोई जवाब नहीं मिला, इसलिए मैंने एक लुकअप फ़ंक्शन लिखा।

def lookup(tokenizer, vec, returnIntNotWord=True):
    """Decode one-hot rows into vocabulary indices or (word, index) pairs.

    Args:
        tokenizer: object exposing a ``word_index`` dict that maps each word
            to its integer index.
        vec: 2-D sequence of rows; every cell equal to 1 is treated as a hot
            entry and its column number is taken as the word index.
        returnIntNotWord: when true, return only the hot column indices;
            otherwise return ``(word, index)`` tuples.

    Returns:
        A list with one entry per hot cell, in row-major order. In word mode
        an index that has no entry in ``tokenizer.word_index`` (for example a
        padding index) yields ``(None, index)``.
    """
    hot_indices = [
        index
        for row in vec
        for index, value in enumerate(row)
        if value == 1
    ]
    if returnIntNotWord:
        # Nothing below is needed for the integer result.
        return hot_indices

    # Resolve words by their actual index value rather than by position in a
    # sorted list: word_index values need not start at 0, so positional
    # lookup returns the wrong word and can raise IndexError.
    index_to_word = {idx: word for word, idx in tokenizer.word_index.items()}
    return [(index_to_word.get(index), index) for index in hot_indices]

tokenizer, Keras का Tokenizer है।
vec, वन-हॉट एन्कोडेड लेबलों का 2D वेक्टर है।
returnIntNotWord का विवरण कोड की टिप्पणियों में दिया गया है।

स्रोत

2018-02-26 15:35:53 Definity

Keras में रिवर्स वर्ड एम्बेडिंग - पायथन

उत्तर

संबंधित मुद्दे