Detecting anomalies in video feeds using a Spatiotemporal Autoencoder
Starting out with this project, I was unsure whether to treat it as a regular classification task or as anomaly detection; after all, anomaly detection can be framed as one-class (binary) classification. However, I realized that collecting data for anomaly detection was the real pain point.
It's impossible to get equal numbers of data points for anomalies and non-anomalies, since anomalies are by definition rare, extreme events. For this project I use the UCSD Anomaly Detection Dataset. It contains two parts (as per the site description):
Peds1: clips of groups of people walking towards and away from the camera, with some amount of perspective distortion. Contains 34 training and 36 testing video samples.
Peds2: scenes with pedestrian movement parallel to the camera plane. Contains 16 training and 12 testing video samples.
Steps to prepare the data: After a thorough literature survey I came across several approaches based on hand-crafted feature selection. However, "Learning Temporal Regularity in Video Sequences" (Mahmudul Hasan, Jonghyun Choi, Jan Neumann, Amit K. Roy-Chowdhury, Larry S. Davis) shows empirically that even state-of-the-art motion feature descriptors may not be optimal for learning regular patterns in videos. Following the paper, I decided to supply the raw input frames as-is to a deep neural network architecture. Without much success using a regular autoencoder, I moved to temporal sequences of input frames so the model learns temporal features as well. Each input sequence consists of T frames stacked together, where T is the sequence length. To increase the volume of input data, temporal sampling augmentation was applied: input frames were concatenated with various skipping strides to construct T-sized input cuboids. Three types of cuboids were sampled from the video sequences, using stride-1, stride-2, and stride-3 (a minimal index sketch follows below). In order to find a suitable architecture I first created a vanilla spatial autoencoder.
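A minimal sketch of the stride sampling (frame indices only, assuming a sequence length T = 10 and a cuboid starting at frame 0):
T = 10
for stride in (1, 2, 3):
    print(f'stride-{stride}:', list(range(0, T * stride, stride)))
# stride-1 -> [0, 1, ..., 9]; stride-2 -> [0, 2, ..., 18]; stride-3 -> [0, 3, ..., 27]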
Individual frames were supplied to the model, and the metric used to identify anomalies was the reconstruction error: the L2 norm (Euclidean distance) between the model output and the input frame. To improve accuracy, a spatiotemporal autoencoder was then implemented following the paper "Abnormal Event Detection in Videos using Spatiotemporal Autoencoder". The structure is reflected in the ConvLSTM model definition below.
Based on the reconstruction error, a threshold can be set to control how sensitive the detections are to anomalies (via the regularity score; the formulation follows the spatiotemporal-autoencoder paper). Once the ground-truth frames are manually labelled as anomaly/non-anomaly, the area under the receiver operating characteristic (ROC) curve (AUC) can be used to find the best threshold; unfortunately I couldn't get my hands on manually labelled frames. For training I used the Adam optimizer, letting it adapt the effective learning rate automatically based on the model's weight-update history.
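Concretely, following Hasan et al., the abnormality score sa(t) and regularity score sr(t) are computed from the reconstruction error e(t) of the sequence at position t:
sa(t) = (e(t) - min_t e(t)) / max_t e(t),  sr(t) = 1 - sa(t)
This is exactly what the evaluation code at the end of this notebook computes. If frame-level labels were available, the detection threshold could be read off the ROC curve; here is a minimal sketch with scikit-learn, where the labels array is a hypothetical placeholder (I didn't have real annotations) and sr is the regularity-score array computed later in the notebook:
from sklearn.metrics import roc_curve, auc
import numpy as np
labels = np.random.randint(0, 2, size=len(sr))      # hypothetical 0/1 ground truth, placeholder only
fpr, tpr, thresholds = roc_curve(labels, 1.0 - sr)  # higher (1 - sr) means more anomalous
print('AUC:', auc(fpr, tpr))
best_threshold = thresholds[np.argmax(tpr - fpr)]   # Youden's J statistic picks the operating point
print('threshold on (1 - sr):', best_threshold)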
References
"Learning Temporal Regularity in Video Sequences" — M. Hasan, J. Choi, J. Neumann, A. K. Roy-Chowdhury, L. S. Davis. CVPR 2016.
"Abnormal Event Detection in Videos using Spatiotemporal Autoencoder" — Y. S. Chong, Y. H. Tay. 2017.
"Anomaly Detection in Crowded Scenes" — V. Mahadevan, W. Li, V. Bhalodia, N. Vasconcelos. CVPR 2010.
Data: UCSD Anomaly Detection Dataset — http://www.svcl.ucsd.edu/projects/anomaly/UCSD_Anomaly_Dataset.tar.gz
# from tensorflow.compat.v1 import ConfigProto
# from tensorflow.compat.v1 import InteractiveSession
# config = ConfigProto()
# config.gpu_options.allow_growth = True
# session = InteractiveSession(config=config)
import numpy as np
import math
import tensorflow as tf
print(tf.__version__)
if tf.test.gpu_device_name():
print('Default GPU Device: {}'.format(tf.test.gpu_device_name()))
else:
print("Please install GPU version of TF")
AUTO = tf.data.experimental.AUTOTUNE
path = '/media/orbo-dl/hdd/orbo_projects/anomaly_detection/'
# path = '/content/'
%cd $path
'''
Anomaly Detection in Crowded Scenes.
V. Mahadevan, W. Li, V. Bhalodia and N. Vasconcelos.
In Proc. IEEE Conference on Computer Vision and Pattern Recognition (CVPR),
San Francisco, CA, 2010
'''
import glob
import os
import tarfile
!wget 'http://www.svcl.ucsd.edu/projects/anomaly/UCSD_Anomaly_Dataset.tar.gz'
tar = tarfile.open("UCSD_Anomaly_Dataset.tar.gz")
tar.extractall()
tar.close()
BATCH_SIZE = 2
INPUT_SHAPE = 144
LATENT_DIM = 2048
TRAIN_PATH = 'UCSD_Anomaly_Dataset.v1p2/UCSDped1/Train'
TEST_PATH = path + '/UCSD_Anomaly_Dataset.v1p2/UCSDped1/Test/Test032'
import numpy as np
import cv2
# from google.colab.patches import cv2_imshow
def add_noise(img):
    '''Add noise to a [0, 1] grayscale image (source: https://gist.github.com/Prasad9/28f6a2df8e8d463c6ddd040f4f6a028a)'''
    noise = np.random.normal(loc=0, scale=1, size=img.shape)
    img2 = img*2
    # apply multiplicative noise to the dark half of the range directly, and to the
    # bright half via its distance from white, then rescale and clip back to [0, 1]
    noisy_image = np.clip(np.where(img2 <= 1, (img2*(1 + noise*0.2)), (1-img2+1)*(1 + noise*0.2)*-1 + 2)/2, 0, 1)
    return noisy_image
import glob
import os
import cv2
import numpy as np
def temporal_augmentation(path, total_strides=3, seq_len=10, noise=False):
    '''
    Data augmentation in the temporal dimension.
    e.g. a stride-1 sequence is made up of frames {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
    a stride-2 sequence of frames {1, 3, 5, 7, 9, 11, 13, 15, 17, 19},
    a stride-3 sequence of frames {1, 4, 7, 10, 13, 16, 19, 22, 25, 28}, and so on.
    @args:
        path (str): path to train data (folders of frames, one folder per ~2 min video)
        total_strides (int): largest temporal sampling stride; strides 1..total_strides are used
        seq_len (int): length of the input sequence to the model, e.g. batch_size x 10 x 144 x 144 x 1
        noise (bool): if True, add noise to every frame
    '''
sequence_frames = []
for idx, folder in enumerate(glob.glob(path+"/*"), 1): #Train000,Train001,..
        assert os.path.isdir(folder), "[ERROR] not a folder"
frames_in_folder = []
for frame in sorted(glob.glob(folder+'/*')):
frame = cv2.imread(frame, 0)
frame = cv2.resize(frame, (INPUT_SHAPE, INPUT_SHAPE))
frame = frame/255.
if noise:
frame = add_noise(frame)
frame = np.expand_dims(frame, axis=-1)
frames_in_folder.append(frame)
print(len(frames_in_folder))
temp_seq = []
        for stride in range(1, total_strides + 1):  # strides 1..total_strides, matching the docstring
for i in range(0, len(frames_in_folder), seq_len):
temp_arr = frames_in_folder[i:i+seq_len*stride:stride]
temp_arr = np.array(temp_arr)
if temp_arr.shape[0]==seq_len:
temp_seq.append(temp_arr)
print(f'[INFO] processed {folder}')
sequence_frames.extend(temp_seq)
sequence_frames = np.array(sequence_frames)
print(f'[INFO] generated the sequence with shape {sequence_frames.shape}')
return sequence_frames
data = temporal_augmentation(TRAIN_PATH)
# noisy_data = temporal_augmentation(TRAIN_PATH, noise=True)
training_data = data
training_data.shape
import cv2
import numpy as np
import tensorflow as tf
def make_two(a):
return a, a
dataset = tf.data.Dataset.from_tensor_slices(training_data)
dataset = dataset.map(make_two)
dataset = dataset.shuffle(2048)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(buffer_size=BATCH_SIZE)
# del training_data
next(iter(dataset))[0].shape
Model identification and Metrics
import cv2
import numpy as np
import glob
files = glob.glob(TRAIN_PATH+'/*/*')
len(files)
data = np.zeros((len(files),INPUT_SHAPE,INPUT_SHAPE, 1))
for idx, filename in enumerate(files):
img = cv2.imread(filename, 0)
img = cv2.resize(img, (INPUT_SHAPE,INPUT_SHAPE))
img = img/255.
data[idx,:,:, 0] = np.array(img, dtype=np.float32)
noisy_data = np.zeros((len(files),INPUT_SHAPE,INPUT_SHAPE, 1))
for idx, filename in enumerate(files):
img = cv2.imread(filename, 0)
img = cv2.resize(img, (INPUT_SHAPE,INPUT_SHAPE))
img = img/255.
img = add_noise(img)
noisy_data[idx,:,:, 0] = np.array(img, dtype=np.float32)
training_data = np.concatenate((data, noisy_data), axis=0)
import tensorflow as tf
def make_two(a):
return a, a
BATCH_SIZE = 16
dataset = tf.data.Dataset.from_tensor_slices(training_data)
dataset = dataset.map(make_two)
dataset = dataset.shuffle(2048)
dataset = dataset.batch(BATCH_SIZE)
dataset = dataset.prefetch(AUTO)
next(iter(dataset))[0].shape
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Conv2D, Conv2DTranspose
from tensorflow.keras.layers import Input, AveragePooling2D, UpSampling2D
def spatial_ae():  # vanilla spatial autoencoder baseline
channel_num = 1
input_tensor = Input(shape=(INPUT_SHAPE, INPUT_SHAPE, channel_num))
conv1 = Conv2D(128, kernel_size=(7, 7), padding='same', name='conv1', activation='relu')(input_tensor)
conv1 = AveragePooling2D((2,2), padding='same')(conv1)
conv2 = Conv2D(64, kernel_size=(3, 3), padding='same', strides=(2, 2), name='conv2', activation='relu')(conv1)
conv2 = AveragePooling2D((2, 2), padding='same')(conv2)
conv3 = Conv2D(32, kernel_size=(3, 3), padding='same', strides=(1, 1), name='conv3', activation='relu')(conv2)
conv3 = AveragePooling2D((2, 2), padding='same')(conv3)
deconv1 = Conv2D(32, kernel_size=(3, 3), padding='same', strides=(1, 1), name='deconv1', activation='relu')(conv3)
deconv1 = UpSampling2D((2, 2))(deconv1)
deconv2 = Conv2D(32, kernel_size=(3, 3), padding='same', strides=(1, 1), name='deconv2', activation='relu')(deconv1)
deconv2 = UpSampling2D((2, 2))(deconv2)
deconv3 = Conv2D(64, kernel_size=(3, 3), padding='same', strides=(1, 1), name='deconv3', activation='relu')(deconv2)
deconv3 = UpSampling2D((2, 2))(deconv3)
deconv4 = Conv2D(64, kernel_size=(3, 3), padding='same', strides=(1, 1), name='deconv4', activation='relu')(deconv3)
deconv4 = UpSampling2D((2, 2))(deconv4)
    decoded = Conv2D(channel_num, kernel_size=(3, 3), padding='same', strides=(1, 1), name='decoded', activation="sigmoid")(deconv4)
return Model(inputs=input_tensor, outputs=decoded)
model = spatial_ae()
model.summary()
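A quick sanity check (my own, not from the paper) that the decoder restores the input resolution: the encoder path goes 144 → 72 → 36 → 18 → 9, and the four UpSampling2D layers bring it back 9 → 18 → 36 → 72 → 144.
dummy = tf.zeros((1, INPUT_SHAPE, INPUT_SHAPE, 1))
assert model(dummy).shape == dummy.shape  # (1, 144, 144, 1) in and out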
from tensorflow.keras.callbacks import ModelCheckpoint
filepath = './anomaly_st_ae_with_noise.h5'
checkpoint = ModelCheckpoint(filepath,
monitor='loss',
verbose=1,
save_best_only=True,
mode='min')
callbacks = [checkpoint]
# resume from a previously saved checkpoint, if one exists
if os.path.exists(filepath):
    model = tf.keras.models.load_model(filepath)
from tensorflow.keras.layers import Conv2DTranspose, ConvLSTM2D, BatchNormalization, TimeDistributed, Conv2D, LayerNormalization, Activation
# from keras_layer_normalization import LayerNormalization
model = tf.keras.models.Sequential()
"""Spatial Encoder with temporal seq (Time Distributed)"""
model.add(TimeDistributed(Conv2D(128, (11, 11), strides=4, padding="same"), batch_input_shape=(None, 10, 144, 144, 1)))
model.add(LayerNormalization())
model.add(TimeDistributed(Conv2D(64, (5, 5), strides=2, padding="same")))
model.add(LayerNormalization())
"""
Conv-LSTM bottleneck
"""
model.add(ConvLSTM2D(64, (3, 3), padding="same", return_sequences=True))
model.add(LayerNormalization())
model.add(ConvLSTM2D(32, (3, 3), padding="same", return_sequences=True))
model.add(LayerNormalization())
model.add(ConvLSTM2D(64, (3, 3), padding="same", return_sequences=True))
model.add(LayerNormalization())
"""Spatial Decoder with temporal seq (Time Distributed)"""
model.add(TimeDistributed(Conv2DTranspose(64, (5, 5), strides=2, padding="same")))
model.add(LayerNormalization())
model.add(TimeDistributed(Conv2DTranspose(128, (11, 11), strides=4, padding="same")))
model.add(LayerNormalization())
model.add(TimeDistributed(Conv2D(1, (11, 11), activation="sigmoid", padding="same")))
model.summary()
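A quick shape check (my own, not from the paper): the two strided TimeDistributed convolutions reduce 144 → 36 → 18, the ConvLSTM bottleneck preserves resolution, and the two transposed convolutions restore 18 → 36 → 144, so each 10-frame cuboid is reconstructed at full size.
dummy = tf.zeros((1, 10, INPUT_SHAPE, INPUT_SHAPE, 1))
assert model(dummy).shape == dummy.shape  # (1, 10, 144, 144, 1) in and out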
from tensorflow.keras.callbacks import ModelCheckpoint
filepath = './anomaly_lstm_without_noise.h5'
checkpoint = ModelCheckpoint(filepath,
monitor='loss',
verbose=1,
save_best_only=True,
mode='min')
callbacks = [checkpoint]
# resume from a previously saved checkpoint, if one exists
if os.path.exists(filepath):
    model = tf.keras.models.load_model(filepath)
model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4, decay=1e-5, epsilon=1e-6), metrics=[tf.keras.metrics.MeanSquaredError()])
model.fit(dataset, epochs=10, callbacks=callbacks)
model.save('anomaly_v1.h5')
# model.summary()
TEST_PATH = path + '/UCSD_Anomaly_Dataset.v1p2/UCSDped1/Test/Test024'
from PIL import Image
import os
def get_test_data():
    sz = 200  # each UCSD Ped1 test clip contains 200 frames
test = np.zeros(shape=(sz, INPUT_SHAPE, INPUT_SHAPE, 1))
cnt = 0
for f in sorted(os.listdir(TEST_PATH)):
if str(os.path.join(TEST_PATH, f))[-3:] == "tif":
img = cv2.imread(os.path.join(TEST_PATH, f), 0)
img = cv2.resize(img, (INPUT_SHAPE, INPUT_SHAPE))
img = img/255.
test[cnt, :, :, 0] = img
cnt = cnt + 1
return test
import matplotlib.pyplot as plt
%matplotlib inline
seq_len = 10
test = get_test_data()
print(test.shape)
sz = test.shape[0] - seq_len + 1
sequences = np.zeros((sz, seq_len, INPUT_SHAPE, INPUT_SHAPE, 1))
# apply the sliding window technique to get the sequences
for i in range(0, sz):
clip = np.zeros((seq_len, INPUT_SHAPE, INPUT_SHAPE, 1))
for j in range(0, seq_len):
clip[j] = test[i + j, :, :, :]
sequences[i] = clip
# get the reconstruction cost of all the sequences
reconstructed_sequences = model.predict(sequences, batch_size=4)
sequences_reconstruction_cost = np.array([np.linalg.norm(np.subtract(sequences[i],reconstructed_sequences[i])) for i in range(0,sz)])
# abnormality score sa(t) and regularity score sr(t), as defined in Hasan et al.
sa = (sequences_reconstruction_cost - np.min(sequences_reconstruction_cost)) / np.max(sequences_reconstruction_cost)
sr = 1.0 - sa
# plot the regularity scores
plt.plot(sr)
plt.ylabel('regularity score Sr(t)')
plt.xlabel('frame t')
plt.show()
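With the regularity curve in hand, a simple way to turn it into detections is to flag positions whose score falls below a threshold; the 0.5 used here is purely illustrative, not a tuned value:
anomalous = np.where(sr < 0.5)[0]
print(f'{len(anomalous)} of {len(sr)} sliding-window positions flagged as anomalous')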
import glob
import os
import cv2
import numpy as np
def generate_test_data(folder, stride=1, seq_len=10, noise=False):
    '''
    Generate test data: split a video (a folder of frames) into sequences seq_len frames long.
    '''
frames_in_folder = []
for frame in sorted(glob.glob(folder+'/*')):
frame = cv2.imread(frame, 0)
frame = cv2.resize(frame, (INPUT_SHAPE, INPUT_SHAPE))
frame = frame/255.
frame = np.expand_dims(frame, axis=-1)
frames_in_folder.append(frame)
print(len(frames_in_folder))
temp_seq = []
for i in range(0, len(frames_in_folder), seq_len):
temp_arr = frames_in_folder[i:i+seq_len*stride:stride]
temp_arr = np.array(temp_arr)
if temp_arr.shape[0]==seq_len:
temp_seq.append(temp_arr)
print(f'[INFO] processed {folder}')
return np.array(temp_seq)
TEST_PATH = path + '/UCSD_Anomaly_Dataset.v1p2/UCSDped1/Test/Test024'
testing_data = generate_test_data(TEST_PATH)
testing_data.shape
from scipy import signal
import matplotlib.pyplot as plt
%matplotlib inline
threshold = 3*255  # threshold on the locally summed pixel error used to highlight anomalous regions
def plot(img, output, diff, H, threshold, counter):
fig, (ax0, ax1, ax2,ax3) = plt.subplots(ncols=4, figsize=(10, 5))
ax0.set_axis_off()
ax1.set_axis_off()
ax2.set_axis_off()
ax0.set_title('input image')
ax1.set_title('reconstructed image')
    ax2.set_title('difference')
ax3.set_title('anomalies')
ax0.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
ax1.imshow(output, cmap=plt.cm.gray, interpolation='nearest')
ax2.imshow(diff, cmap=plt.cm.viridis, vmin=0, vmax=255, interpolation='nearest')
ax3.imshow(img, cmap=plt.cm.gray, interpolation='nearest')
x,y = np.where(H > threshold)
ax3.scatter(y,x,color='red',s=0.1)
plt.axis('off')
plt.savefig(os.path.join(output_directory, 'frame_{:03d}.png'.format(counter+1)))
plt.show()
output_directory = './frames'
os.makedirs(output_directory, exist_ok=True)
counter = 0
for index in range(testing_data.shape[0]):
    input_seq = testing_data[index, :, :, :]
    print(index, "------------------------")
output_seq = model.predict(np.expand_dims(input_seq, axis=0))
output_seq = np.squeeze(output_seq)
print(output_seq.shape)
for i in range(output_seq.shape[0]):
input_img = np.squeeze(input_seq[i])
output_img = np.squeeze(output_seq[i])
        recon = output_img*255
        original = input_img*255
        diff = np.abs(recon - original)
        # sum the per-pixel error over 4x4 neighbourhoods to localize anomalous regions
        H = signal.convolve2d(diff, np.ones((4,4)), mode='same')
        plot(original, recon, diff, H, threshold, counter)
counter += 1
!ffmpeg -y -i ./frames/frame_%03d.png -c:v libx264 -vf fps=25 -pix_fmt yuv420p out.mp4
testing_data.shape
TEST_PATH = path + '/UCSD_Anomaly_Dataset.v1p2/UCSDped1/Test/Test024'
files = sorted(glob.glob(TEST_PATH+'/*'))
len(files)
a = np.zeros((len(files),INPUT_SHAPE,INPUT_SHAPE, 1))
for idx, filename in enumerate(files):
img = cv2.imread(filename, 0)
img = cv2.resize(img, (INPUT_SHAPE,INPUT_SHAPE))
a[idx,:,:, 0] = np.array(img, dtype=np.float32)/255.0
testing_data = a
testing_data.shape
from scipy import signal
import matplotlib.pyplot as plt
import os
%matplotlib inline
threshold = 4*255
# reuse the plot() helper defined above
output_directory = './frames'
os.makedirs(output_directory, exist_ok=True)
counter = 0
for index in range(testing_data.shape[0]):
    input_img = np.squeeze(testing_data[index, :, :, :])
    print(index, "------------------------")
    output_img = np.squeeze(model.predict(np.expand_dims(testing_data[index], axis=0)))
    recon = output_img*255
    original = input_img*255
    diff = np.abs(recon - original)
    print(diff.shape, recon.shape, original.shape)
    # sum the per-pixel error over 4x4 neighbourhoods to localize anomalous regions
    H = signal.convolve2d(diff, np.ones((4,4)), mode='same')
    plot(original, recon, diff, H, threshold, counter)
    counter += 1
!ffmpeg -y -i ./frames/frame_%03d.png -c:v libx264 -vf fps=25 -pix_fmt yuv420p out_st_ae_32.mp4