% Conference paper in the proceedings of SDS 2022 (IEEE); citation key kept as exported.
% Names are stored in "Last, First" form so BibTeX cannot mis-split them.
% NOTE(review): the last editor was exported as "Benkhelifa Elhadj" (apparently
% surname-first, unlike the other editors); it is entered here with Benkhelifa as
% the surname -- confirm against the proceedings front matter.
% NOTE(review): "address" holds only a country; replace with the publisher's city if known.
@inproceedings{a1ba78ca46b74277a856a7b64ed50f6c,
  author    = {Ali, Adnan and Mirza, Nada Masood and Bader, Rawad and Ishak, Mohamad Khairi},
  title     = {A Simple Approach for Data Cleansing on {Hadoop} Framework using File Merging Technique},
  booktitle = {2022 9th International Conference on Software Defined Systems, SDS 2022},
  editor    = {Boubshir, Larbi and Daachi, Boubaker and Mokrane, Abdellah and Jararweh, Yaser and Benkhelifa, Elhadj},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  address   = {United States},
  year      = {2022},
  month     = dec,
  doi       = {10.1109/SDS57574.2022.10062900},
  keywords  = {Big Data, Data Cleansing, HDFS, Hadoop},
  abstract  = {Hadoop framework is known for being top-notch in processing these huge files and providing useful data. Unfortunately, in a scenario with many small files, the framework is inefficient and fails to deliver. These small files cause many issues when the framework's processing criteria and performance levels. Moreover, these small files contain content that is useless or provides no benefit in the key-value decision-making. To overcome this issue of small files and unnecessary content, this paper proposes a simple data cleansing and file merging approach based on specific type and size that will not only be effective but will increase the framework's performance by approx. 68\%. This algorithm ensures the output will be a few huge files with essential/important data. The results show that the proposed system not only improves the framework's performance but also reduces deadlocks in the framework processes, which is approximately 68\% improvement over the base Hadoop framework processing.},
  language  = {English},
  note      = {Publisher Copyright: {\textcopyright} 2022 IEEE.; 9th International Conference on Software Defined Systems, SDS 2022 ; Conference date: 12-12-2022 Through 15-12-2022},
}