@conference{3330aba0ab564260baf56834e3a99144,
  author   = "Stylianos Ioannis Mimilakis and Konstantinos Drossos and Jo{\~a}o F. Santos and Gerald Schuller and Tuomas Virtanen and Yoshua Bengio",
  title    = "{M}onaural {S}inging {V}oice {S}eparation with {S}kip-{F}iltering {C}onnections and {R}ecurrent {I}nference of {T}ime-{F}requency {M}ask",
  year     = "2018",
  abstract = "Singing voice separation based on deep learning relies on the usage of time-frequency masking. In many cases the masking process is not a learnable function, or it is not encapsulated into the deep learning optimization. Consequently, most of the existing methods rely on a post-processing step using generalized Wiener filtering. This work proposes a method that learns and optimizes (during training) a source-dependent mask and does not require the aforementioned post-processing step. We introduce a recurrent inference algorithm, a sparse transformation step to improve the mask generation process, and a learned denoising filter. The obtained results show an increase of 0.49 dB for the signal-to-distortion ratio and 0.30 dB for the signal-to-interference ratio, compared to previous state-of-the-art approaches for monaural singing voice separation.",
}